From e50ef8d372671def69dd0667ec8f4cb0584b3c0f Mon Sep 17 00:00:00 2001 From: AlongWY Date: Sun, 8 Oct 2023 05:20:27 +0000 Subject: [PATCH] deploy: 72066be21ad467c8ffc76b74c152b38decf3f0ac --- .nojekyll | 0 cache.json | 1 + favicon.ico | Bin 0 -> 15086 bytes index.css | 355 + index.html | 57691 ++++++++++++++++++++++++++++++++++++++++++++++++++ index.js | 39 + 6 files changed, 58086 insertions(+) create mode 100644 .nojekyll create mode 100644 cache.json create mode 100644 favicon.ico create mode 100644 index.css create mode 100644 index.html create mode 100644 index.js diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/cache.json b/cache.json new file mode 100644 index 00000000..6a405b79 --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2023-10-02T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2306.08018v3","updated":"2023-10-02T15:27:20Z","published":"2023-06-13T14:35:34Z","title":"Mol-Instructions: A Large-Scale Biomolecular Instruction Dataset for\n Large Language Models","summary":" Large Language Models (LLMs), with their remarkable task-handling\ncapabilities and innovative outputs, have catalyzed significant advancements\nacross a spectrum of fields. However, their proficiency within specialized\ndomains such as biomolecular studies remains limited. To address this\nchallenge, we introduce Mol-Instructions, a comprehensive instruction dataset\ndesigned for the biomolecular domain. Mol-Instructions encompasses three key\ncomponents: molecule-oriented instructions, protein-oriented instructions, and\nbiomolecular text instructions. Each component aims to improve the\nunderstanding and prediction capabilities of LLMs concerning biomolecular\nfeatures and behaviors. Through extensive instruction tuning experiments on\nLLMs, we demonstrate the effectiveness of Mol-Instructions in enhancing large\nmodels' performance in the intricate realm of biomolecular studies, thus\nfostering progress in the biomolecular research community. Mol-Instructions is\npublicly available for ongoing research and will undergo regular updates to\nenhance its applicability.\n","authors":["Yin Fang","Xiaozhuan Liang","Ningyu Zhang","Kangwei Liu","Rui Huang","Zhuo Chen","Xiaohui Fan","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2306.08018v3.pdf","comment":"Project homepage: https://github.com/zjunlp/Mol-Instructions"},{"id":"http://arxiv.org/abs/2309.08448v2","updated":"2023-10-02T15:22:42Z","published":"2023-09-15T14:52:23Z","title":"Advancing the Evaluation of Traditional Chinese Language Models: Towards\n a Comprehensive Benchmark Suite","summary":" The evaluation of large language models is an essential task in the field of\nlanguage understanding and generation. As language models continue to advance,\nthe need for effective benchmarks to assess their performance has become\nimperative. In the context of Traditional Chinese, there is a scarcity of\ncomprehensive and diverse benchmarks to evaluate the capabilities of language\nmodels, despite the existence of certain benchmarks such as DRCD, TTQA, CMDQA,\nand FGC dataset. To address this gap, we propose a novel set of benchmarks that\nleverage existing English datasets and are tailored to evaluate language models\nin Traditional Chinese. These benchmarks encompass a wide range of tasks,\nincluding contextual question-answering, summarization, classification, and\ntable understanding. 
The proposed benchmarks offer a comprehensive evaluation\nframework, enabling the assessment of language models' capabilities across\ndifferent tasks. In this paper, we evaluate the performance of GPT-3.5,\nTaiwan-LLaMa-v1.0, and Model 7-C, our proprietary model, on these benchmarks.\nThe evaluation results highlight that our model, Model 7-C, achieves\nperformance comparable to GPT-3.5 with respect to a part of the evaluated\ncapabilities. In an effort to advance the evaluation of language models in\nTraditional Chinese and stimulate further research in this field, we have\nopen-sourced our benchmark and opened the model for trial.\n","authors":["Chan-Jan Hsu","Chang-Le Liu","Feng-Ting Liao","Po-Chun Hsu","Yi-Chang Chen","Da-shan Shiu"],"pdf_url":"https://arxiv.org/pdf/2309.08448v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.11259v5","updated":"2023-10-02T15:17:34Z","published":"2023-01-26T17:52:56Z","title":"Domain-Agnostic Molecular Generation with Self-feedback","summary":" The generation of molecules with desired properties has gained tremendous\npopularity, revolutionizing the way scientists design molecular structures and\nproviding valuable support for chemical and drug design. However, despite the\npotential of language models in molecule generation, they face numerous\nchallenges such as the generation of syntactically or chemically flawed\nmolecules, narrow domain focus, and limitations in creating diverse and\ndirectionally feasible molecules due to a dearth of annotated data or external\nmolecular databases. To tackle these challenges, we introduce MolGen, a\npre-trained molecular language model tailored specifically for molecule\ngeneration. Through the reconstruction of over 100 million molecular SELFIES,\nMolGen internalizes profound structural and grammatical insights. This is\nfurther enhanced by domain-agnostic molecular prefix tuning, fostering robust\nknowledge transfer across diverse domains. Importantly, our self-feedback\nparadigm steers the model away from ``molecular hallucinations'', ensuring\nalignment between the model's estimated probabilities and real-world chemical\npreferences. Extensive experiments on well-known benchmarks underscore MolGen's\noptimization capabilities in properties such as penalized logP, QED, and\nmolecular docking. Additional analyses affirm its proficiency in accurately\ncapturing molecule distributions, discerning intricate structural patterns, and\nefficiently exploring the chemical space. Code is available at\nhttps://github.com/zjunlp/MolGen.\n","authors":["Yin Fang","Ningyu Zhang","Zhuo Chen","Lingbing Guo","Xiaohui Fan","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2301.11259v5.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2303.03124v2","updated":"2023-10-02T15:15:33Z","published":"2023-03-06T13:37:59Z","title":"IFAN: An Explainability-Focused Interaction Framework for Humans and NLP\n Models","summary":" Interpretability and human oversight are fundamental pillars of deploying\ncomplex NLP models into real-world applications. However, applying\nexplainability and human-in-the-loop methods requires technical proficiency.\nDespite existing toolkits for model understanding and analysis, options to\nintegrate human feedback are still limited. We propose IFAN, a framework for\nreal-time explanation-based interaction with NLP models. 
Through IFAN's\ninterface, users can provide feedback to selected model explanations, which is\nthen integrated through adapter layers to align the model with human rationale.\nWe show the system to be effective in debiasing a hate speech classifier with\nminimal impact on performance. IFAN also offers a visual admin system and API\nto manage models (and datasets) as well as control access rights. A demo is\nlive at https://ifan.ml.\n","authors":["Edoardo Mosca","Daryna Dementieva","Tohid Ebrahim Ajdari","Maximilian Kummeth","Kirill Gringauz","Yutong Zhou","Georg Groh"],"pdf_url":"https://arxiv.org/pdf/2303.03124v2.pdf","comment":"Accepted to AACL 2023 Demonstration systems Track"},{"id":"http://arxiv.org/abs/2305.14279v3","updated":"2023-10-02T15:09:41Z","published":"2023-05-23T17:25:59Z","title":"Two Failures of Self-Consistency in the Multi-Step Reasoning of LLMs","summary":" Large language models (LLMs) have achieved widespread success on a variety of\nin-context few-shot tasks, but this success is typically evaluated via\ncorrectness rather than consistency. We argue that self-consistency is an\nimportant criteria for valid multi-step reasoning in tasks where the solution\nis composed of the answers to multiple sub-steps. We propose two types of\nself-consistency that are particularly important for multi-step reasoning --\nhypothetical consistency (a model's ability to predict what its output would be\nin a hypothetical other context) and compositional consistency (consistency of\na model's final outputs when intermediate sub-steps are replaced with the\nmodel's outputs for those steps). We demonstrate that multiple variants of the\nGPT-3/-4 models exhibit poor consistency rates across both types of consistency\non a variety of tasks.\n","authors":["Angelica Chen","Jason Phang","Alicia Parrish","Vishakh Padmakumar","Chen Zhao","Samuel R. Bowman","Kyunghyun Cho"],"pdf_url":"https://arxiv.org/pdf/2305.14279v3.pdf","comment":"Added GPT-4 results"},{"id":"http://arxiv.org/abs/2304.03897v2","updated":"2023-10-02T14:48:45Z","published":"2023-04-08T03:14:19Z","title":"Factify 2: A Multimodal Fake News and Satire News Dataset","summary":" The internet gives the world an open platform to express their views and\nshare their stories. While this is very valuable, it makes fake news one of our\nsociety's most pressing problems. Manual fact checking process is time\nconsuming, which makes it challenging to disprove misleading assertions before\nthey cause significant harm. This is he driving interest in automatic fact or\nclaim verification. Some of the existing datasets aim to support development of\nautomating fact-checking techniques, however, most of them are text based.\nMulti-modal fact verification has received relatively scant attention. In this\npaper, we provide a multi-modal fact-checking dataset called FACTIFY 2,\nimproving Factify 1 by using new data sources and adding satire articles.\nFactify 2 has 50,000 new data instances. Similar to FACTIFY 1.0, we have three\nbroad categories - support, no-evidence, and refute, with sub-categories based\non the entailment of visual and textual data. We also provide a BERT and Vison\nTransformer based baseline, which achieves 65% F1 score in the test set. 
The\nbaseline codes and the dataset will be made available at\nhttps://github.com/surya1701/Factify-2.0.\n","authors":["S Suryavardan","Shreyash Mishra","Parth Patwa","Megha Chakraborty","Anku Rani","Aishwarya Reganti","Aman Chadha","Amitava Das","Amit Sheth","Manoj Chinnakotla","Asif Ekbal","Srijan Kumar"],"pdf_url":"https://arxiv.org/pdf/2304.03897v2.pdf","comment":"Defactify2 @AAAI2023"},{"id":"http://arxiv.org/abs/2309.07915v2","updated":"2023-10-02T14:46:01Z","published":"2023-09-14T17:59:17Z","title":"MMICL: Empowering Vision-language Model with Multi-Modal In-Context\n Learning","summary":" Since the resurgence of deep learning, vision-language models (VLMs) enhanced\nby large language models (LLMs) have grown exponentially in popularity.\nHowever, while LLMs can utilize extensive background knowledge and task\ninformation with in-context learning, most VLMs still struggle with\nunderstanding complex multi-modal prompts with multiple images, making VLMs\nless effective in downstream vision-language tasks. In this paper, we address\nthe limitation above by 1) introducing MMICL, a new approach to allow the VLM\nto deal with multi-modal inputs efficiently; 2) proposing a novel context\nscheme to augment the in-context learning ability of the VLM; 3) constructing\nthe Multi-modal In-Context Learning (MIC) dataset, designed to enhance the\nVLM's ability to understand complex multi-modal prompts. Our experiments\nconfirm that MMICL achieves new state-of-the-art zero-shot performance on a\nwide range of general vision-language tasks, especially for complex benchmarks,\nincluding MME and MMBench. Our analysis demonstrates that MMICL effectively\ntackles the challenge of complex multi-modal prompt understanding and emerges\nthe impressive ICL ability. Furthermore, we observe that MMICL successfully\nalleviates language bias in VLMs, a common issue for VLMs that often leads to\nhallucination when faced with extensive textual context.\n","authors":["Haozhe Zhao","Zefan Cai","Shuzheng Si","Xiaojian Ma","Kaikai An","Liang Chen","Zixuan Liu","Sheng Wang","Wenjuan Han","Baobao Chang"],"pdf_url":"https://arxiv.org/pdf/2309.07915v2.pdf","comment":"Code, dataset, checkpoints, and demos are available at\n https://github.com/PKUnlp-icler/MIC"},{"id":"http://arxiv.org/abs/2303.09892v3","updated":"2023-10-02T14:28:03Z","published":"2023-03-17T11:13:30Z","title":"Memotion 3: Dataset on Sentiment and Emotion Analysis of Codemixed\n Hindi-English Memes","summary":" Memes are the new-age conveyance mechanism for humor on social media sites.\nMemes often include an image and some text. Memes can be used to promote\ndisinformation or hatred, thus it is crucial to investigate in details. We\nintroduce Memotion 3, a new dataset with 10,000 annotated memes. Unlike other\nprevalent datasets in the domain, including prior iterations of Memotion,\nMemotion 3 introduces Hindi-English Codemixed memes while prior works in the\narea were limited to only the English memes. We describe the Memotion task, the\ndata collection and the dataset creation methodologies. We also provide a\nbaseline for the task. 
The baseline code and dataset will be made available at\nhttps://github.com/Shreyashm16/Memotion-3.0\n","authors":["Shreyash Mishra","S Suryavardan","Parth Patwa","Megha Chakraborty","Anku Rani","Aishwarya Reganti","Aman Chadha","Amitava Das","Amit Sheth","Manoj Chinnakotla","Asif Ekbal","Srijan Kumar"],"pdf_url":"https://arxiv.org/pdf/2303.09892v3.pdf","comment":"Defactify2 @AAAI"},{"id":"http://arxiv.org/abs/2302.14838v2","updated":"2023-10-02T14:26:42Z","published":"2023-02-28T18:37:25Z","title":"EvoPrompting: Language Models for Code-Level Neural Architecture Search","summary":" Given the recent impressive accomplishments of language models (LMs) for code\ngeneration, we explore the use of LMs as adaptive mutation and crossover\noperators for an evolutionary neural architecture search (NAS) algorithm. While\nNAS still proves too difficult a task for LMs to succeed at solely through\nprompting, we find that the combination of evolutionary prompt engineering with\nsoft prompt-tuning, a method we term EvoPrompting, consistently finds diverse\nand high performing models. We first demonstrate that EvoPrompting is effective\non the computationally efficient MNIST-1D dataset, where EvoPrompting produces\nconvolutional architecture variants that outperform both those designed by\nhuman experts and naive few-shot prompting in terms of accuracy and model size.\nWe then apply our method to searching for graph neural networks on the CLRS\nAlgorithmic Reasoning Benchmark, where EvoPrompting is able to design novel\narchitectures that outperform current state-of-the-art models on 21 out of 30\nalgorithmic reasoning tasks while maintaining similar model size. EvoPrompting\nis successful at designing accurate and efficient neural network architectures\nacross a variety of machine learning tasks, while also being general enough for\neasy adaptation to other tasks beyond neural network design.\n","authors":["Angelica Chen","David M. Dohan","David R. So"],"pdf_url":"https://arxiv.org/pdf/2302.14838v2.pdf","comment":"To be presented at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2305.16183v2","updated":"2023-10-02T13:04:59Z","published":"2023-05-25T15:39:46Z","title":"Passive learning of active causal strategies in agents and language\n models","summary":" What can be learned about causality and experimentation from passive data?\nThis question is salient given recent successes of passively-trained language\nmodels in interactive domains such as tool use. Passive learning is inherently\nlimited. However, we show that purely passive learning can in fact allow an\nagent to learn generalizable strategies for determining and using causal\nstructures, as long as the agent can intervene at test time. We formally\nillustrate that learning a strategy of first experimenting, then seeking goals,\ncan allow generalization from passive learning in principle. We then show\nempirically that agents trained via imitation on expert data can indeed\ngeneralize at test time to infer and use causal links which are never present\nin the training data; these agents can also generalize experimentation\nstrategies to novel variable sets never observed in training. We then show that\nstrategies for causal intervention and exploitation can be generalized from\npassive data even in a more complex environment with high-dimensional\nobservations, with the support of natural language explanations. Explanations\ncan even allow passive learners to generalize out-of-distribution from\nperfectly-confounded training data. 
Finally, we show that language models,\ntrained only on passive next-word prediction, can generalize causal\nintervention strategies from a few-shot prompt containing examples of\nexperimentation, together with explanations and reasoning. These results\nhighlight the surprising power of passive learning of active causal strategies,\nand may help to understand the behaviors and capabilities of language models.\n","authors":["Andrew Kyle Lampinen","Stephanie C Y Chan","Ishita Dasgupta","Andrew J Nam","Jane X Wang"],"pdf_url":"https://arxiv.org/pdf/2305.16183v2.pdf","comment":"Advances in Neural Information Processing Systems (NeurIPS 2023). 10\n pages main text"},{"id":"http://arxiv.org/abs/2307.06930v2","updated":"2023-10-02T11:58:10Z","published":"2023-07-13T17:51:58Z","title":"mBLIP: Efficient Bootstrapping of Multilingual Vision-LLMs","summary":" Modular vision-language models (Vision-LLMs) align pretrained image encoders\nwith frozen large language models (LLMs), representing a computationally much\nmore efficient alternative to end-to-end training of large vision-language\nmodels from scratch, which is prohibitively expensive for most researchers and\npractitioners. Vision-LLMs instead post-hoc condition LLMs to `understand' the\noutput of an image encoder. With the abundance of readily available\nhigh-quality English image-text data as well as monolingual English LLMs, the\nresearch focus has been on English-only Vision-LLMs. Multilingual\nvision-language models are still predominantly obtained via expensive\nend-to-end pretraining, resulting in comparatively smaller models, trained on\nlimited multilingual image data supplemented with text-only multilingual\ncorpora. In this work, we present mBLIP, the first multilingual Vision-LLM,\nwhich we obtain in a computationally efficient manner -- on consumer hardware\nand using only a few million training examples -- by leveraging a pretrained\nmultilingual LLM. To this end, we \\textit{re-align} an image encoder previously\ntuned to an English LLM to a new, multilingual LLM -- for this, we leverage\nmultilingual data from a mix of vision-and-language tasks, which we obtain by\nmachine-translating high-quality English data to 95 languages. On the IGLUE\nbenchmark, mBLIP yields results competitive with state-of-the-art models.\nMoreover, in image captioning on XM3600, mBLIP (zero-shot) even outperforms\nPaLI-X (a model with 55B parameters). Compared to these very large multilingual\nvision-language models trained from scratch, we obtain mBLIP by training orders\nof magnitude fewer parameters on magnitudes less data. We release our model and\ncode at \\url{https://github.com/gregor-ge/mBLIP}.\n","authors":["Gregor Geigle","Abhay Jain","Radu Timofte","Goran Glavaš"],"pdf_url":"https://arxiv.org/pdf/2307.06930v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.15806v2","updated":"2023-10-02T10:32:35Z","published":"2023-09-27T17:29:41Z","title":"Lyra: Orchestrating Dual Correction in Automated Theorem Proving","summary":" Large Language Models (LLMs) present an intriguing avenue for exploration in\nthe field of formal theorem proving. Nevertheless, their full potential,\nparticularly concerning the mitigation of hallucinations and refinement through\nprover error messages, remains an area that has yet to be thoroughly\ninvestigated. To enhance the effectiveness of LLMs in the field, we introduce\nthe Lyra, a new framework that employs two distinct correction mechanisms: Tool\nCorrection (TC) and Conjecture Correction (CC). 
To implement Tool Correction in\nthe post-processing of formal proofs, we leverage prior knowledge to utilize\npredefined prover tools (e.g., Sledgehammer) for guiding the replacement of\nincorrect tools. Tool Correction significantly contributes to mitigating\nhallucinations, thereby improving the overall accuracy of the proof. In\naddition, we introduce Conjecture Correction, an error feedback mechanism\ndesigned to interact with prover to refine formal proof conjectures with prover\nerror messages. Compared to the previous refinement framework, the proposed\nConjecture Correction refines generation with instruction but does not collect\npaired (generation, error & refinement) prompts. Our method has achieved\nstate-of-the-art (SOTA) performance on both miniF2F validation (48.0% -> 55.3%)\nand test (45.5% -> 51.2%). We also present 3 IMO problems solved by Lyra. We\nbelieve Tool Correction (post-process for hallucination mitigation) and\nConjecture Correction (subgoal adjustment from interaction with environment)\ncould provide a promising avenue for future research in this field.\n","authors":["Chuanyang Zheng","Haiming Wang","Enze Xie","Zhengying Liu","Jiankai Sun","Huajian Xin","Jianhao Shen","Zhenguo Li","Yu Li"],"pdf_url":"https://arxiv.org/pdf/2309.15806v2.pdf","comment":"Tech Report"},{"id":"http://arxiv.org/abs/2309.17446v2","updated":"2023-10-02T09:54:50Z","published":"2023-09-29T17:57:00Z","title":"L2CEval: Evaluating Language-to-Code Generation Capabilities of Large\n Language Models","summary":" Recently, large language models (LLMs), especially those that are pretrained\non code, have demonstrated strong capabilities in generating programs from\nnatural language inputs in a few-shot or even zero-shot manner. Despite\npromising results, there is a notable lack of a comprehensive evaluation of\nthese models language-to-code generation capabilities. Existing studies often\nfocus on specific tasks, model architectures, or learning paradigms, leading to\na fragmented understanding of the overall landscape. In this work, we present\nL2CEval, a systematic evaluation of the language-to-code generation\ncapabilities of LLMs on 7 tasks across the domain spectrum of semantic parsing,\nmath reasoning and Python programming, analyzing the factors that potentially\naffect their performance, such as model size, pretraining data, instruction\ntuning, and different prompting methods. In addition to assessing model\nperformance, we measure confidence calibration for the models and conduct human\nevaluations of the output programs. This enables us to identify and analyze the\ntypical failure modes across various tasks and models. L2CEval offers a\ncomprehensive understanding of the capabilities and limitations of LLMs in\nlanguage-to-code generation. 
We also release the evaluation framework and all\nmodel outputs, hoping to lay the groundwork for further future research in this\ndomain.\n","authors":["Ansong Ni","Pengcheng Yin","Yilun Zhao","Martin Riddell","Troy Feng","Rui Shen","Stephen Yin","Ye Liu","Semih Yavuz","Caiming Xiong","Shafiq Joty","Yingbo Zhou","Dragomir Radev","Arman Cohan"],"pdf_url":"https://arxiv.org/pdf/2309.17446v2.pdf","comment":"Project Website: https://l2c-eval.github.io/"},{"id":"http://arxiv.org/abs/2307.03354v2","updated":"2023-10-02T08:59:09Z","published":"2023-07-07T02:26:18Z","title":"Token-Level Serialized Output Training for Joint Streaming ASR and ST\n Leveraging Textual Alignments","summary":" In real-world applications, users often require both translations and\ntranscriptions of speech to enhance their comprehension, particularly in\nstreaming scenarios where incremental generation is necessary. This paper\nintroduces a streaming Transformer-Transducer that jointly generates automatic\nspeech recognition (ASR) and speech translation (ST) outputs using a single\ndecoder. To produce ASR and ST content effectively with minimal latency, we\npropose a joint token-level serialized output training method that interleaves\nsource and target words by leveraging an off-the-shelf textual aligner.\nExperiments in monolingual (it-en) and multilingual (\\{de,es,it\\}-en) settings\ndemonstrate that our approach achieves the best quality-latency balance. With\nan average ASR latency of 1s and ST latency of 1.3s, our model shows no\ndegradation or even improves output quality compared to separate ASR and ST\nmodels, yielding an average improvement of 1.1 WER and 0.4 BLEU in the\nmultilingual case.\n","authors":["Sara Papi","Peidong Wang","Junkun Chen","Jian Xue","Jinyu Li","Yashesh Gaur"],"pdf_url":"https://arxiv.org/pdf/2307.03354v2.pdf","comment":"Accepted at ASRU 2023"},{"id":"http://arxiv.org/abs/2309.16671v3","updated":"2023-10-02T07:12:53Z","published":"2023-09-28T17:59:56Z","title":"Demystifying CLIP Data","summary":" Contrastive Language-Image Pre-training (CLIP) is an approach that has\nadvanced research and applications in computer vision, fueling modern\nrecognition systems and generative models. We believe that the main ingredient\nto the success of CLIP is its data and not the model architecture or\npre-training objective. However, CLIP only provides very limited information\nabout its data and how it has been collected, leading to works that aim to\nreproduce CLIP's data by filtering with its model parameters. In this work, we\nintend to reveal CLIP's data curation approach and in our pursuit of making it\nopen to the community introduce Metadata-Curated Language-Image Pre-training\n(MetaCLIP). MetaCLIP takes a raw data pool and metadata (derived from CLIP's\nconcepts) and yields a balanced subset over the metadata distribution. Our\nexperimental study rigorously isolates the model and training settings,\nconcentrating solely on data. MetaCLIP applied to CommonCrawl with 400M\nimage-text data pairs outperforms CLIP's data on multiple standard benchmarks.\nIn zero-shot ImageNet classification, MetaCLIP achieves 70.8% accuracy,\nsurpassing CLIP's 68.3% on ViT-B models. Scaling to 1B data, while maintaining\nthe same training budget, attains 72.4%. Our observations hold across various\nmodel sizes, exemplified by ViT-H achieving 80.5%, without any\nbells-and-whistles. 
Curation code and training data distribution on metadata is\nmade available at https://github.com/facebookresearch/MetaCLIP.\n","authors":["Hu Xu","Saining Xie","Xiaoqing Ellen Tan","Po-Yao Huang","Russell Howes","Vasu Sharma","Shang-Wen Li","Gargi Ghosh","Luke Zettlemoyer","Christoph Feichtenhofer"],"pdf_url":"https://arxiv.org/pdf/2309.16671v3.pdf","comment":"17 pages. arXiv admin note: text overlap with arXiv:2103.00020 by\n other authors"},{"id":"http://arxiv.org/abs/2307.03917v3","updated":"2023-10-02T06:57:19Z","published":"2023-07-08T06:47:58Z","title":"On decoder-only architecture for speech-to-text and large language model\n integration","summary":" Large language models (LLMs) have achieved remarkable success in the field of\nnatural language processing, enabling better human-computer interaction using\nnatural language. However, the seamless integration of speech signals into LLMs\nhas not been explored well. The \"decoder-only\" architecture has also not been\nwell studied for speech processing tasks. In this research, we introduce\nSpeech-LLaMA, a novel approach that effectively incorporates acoustic\ninformation into text-based large language models. Our method leverages\nConnectionist Temporal Classification and a simple audio encoder to map the\ncompressed acoustic features to the continuous semantic space of the LLM. In\naddition, we further probe the decoder-only architecture for speech-to-text\ntasks by training a smaller scale randomly initialized speech-LLaMA model from\nspeech-text paired data alone. We conduct experiments on multilingual\nspeech-to-text translation tasks and demonstrate a significant improvement over\nstrong baselines, highlighting the potential advantages of decoder-only models\nfor speech-to-text conversion.\n","authors":["Jian Wu","Yashesh Gaur","Zhuo Chen","Long Zhou","Yimeng Zhu","Tianrui Wang","Jinyu Li","Shujie Liu","Bo Ren","Linquan Liu","Yu Wu"],"pdf_url":"https://arxiv.org/pdf/2307.03917v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.11644v2","updated":"2023-10-02T06:12:30Z","published":"2023-06-20T16:14:25Z","title":"Textbooks Are All You Need","summary":" We introduce phi-1, a new large language model for code, with significantly\nsmaller size than competing models: phi-1 is a Transformer-based model with\n1.3B parameters, trained for 4 days on 8 A100s, using a selection of ``textbook\nquality\" data from the web (6B tokens) and synthetically generated textbooks\nand exercises with GPT-3.5 (1B tokens). Despite this small scale, phi-1 attains\npass@1 accuracy 50.6% on HumanEval and 55.5% on MBPP. It also displays\nsurprising emergent properties compared to phi-1-base, our model before our\nfinetuning stage on a dataset of coding exercises, and phi-1-small, a smaller\nmodel with 350M parameters trained with the same pipeline as phi-1 that still\nachieves 45% on HumanEval.\n","authors":["Suriya Gunasekar","Yi Zhang","Jyoti Aneja","Caio César Teodoro Mendes","Allie Del Giorno","Sivakanth Gopi","Mojan Javaheripi","Piero Kauffmann","Gustavo de Rosa","Olli Saarikivi","Adil Salim","Shital Shah","Harkirat Singh Behl","Xin Wang","Sébastien Bubeck","Ronen Eldan","Adam Tauman Kalai","Yin Tat Lee","Yuanzhi Li"],"pdf_url":"https://arxiv.org/pdf/2306.11644v2.pdf","comment":"26 pages; changed color scheme of plot. 
fixed minor typos and added\n couple clarifications"},{"id":"http://arxiv.org/abs/2309.17157v2","updated":"2023-10-02T05:44:25Z","published":"2023-09-29T11:46:07Z","title":"LatticeGen: A Cooperative Framework which Hides Generated Text in a\n Lattice for Privacy-Aware Generation on Cloud","summary":" In the current user-server interaction paradigm of prompted generation with\nlarge language models (LLM) on cloud, the server fully controls the generation\nprocess, which leaves zero options for users who want to keep the generated\ntext to themselves. We propose LatticeGen, a cooperative framework in which the\nserver still handles most of the computation while the user controls the\nsampling operation. The key idea is that the true generated sequence is mixed\nwith noise tokens by the user and hidden in a noised lattice. Considering\npotential attacks from a hypothetically malicious server and how the user can\ndefend against it, we propose the repeated beam-search attack and the mixing\nnoise scheme. In our experiments we apply LatticeGen to protect both prompt and\ngeneration. It is shown that while the noised lattice degrades generation\nquality, LatticeGen successfully protects the true generation to a remarkable\ndegree under strong attacks (more than 50% of the semantic remains hidden as\nmeasured by BERTScore).\n","authors":["Mengke Zhang","Tianxing He","Tianle Wang","Lu Mi","Fatemehsadat Mireshghallah","Binyi Chen","Hao Wang","Yulia Tsvetkov"],"pdf_url":"https://arxiv.org/pdf/2309.17157v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11394v2","updated":"2023-10-02T05:25:18Z","published":"2023-07-21T07:22:18Z","title":"MeetEval: A Toolkit for Computation of Word Error Rates for Meeting\n Transcription Systems","summary":" MeetEval is an open-source toolkit to evaluate all kinds of meeting\ntranscription systems. It provides a unified interface for the computation of\ncommonly used Word Error Rates (WERs), specifically cpWER, ORC WER and MIMO WER\nalong other WER definitions. We extend the cpWER computation by a temporal\nconstraint to ensure that only words are identified as correct when the\ntemporal alignment is plausible. This leads to a better quality of the matching\nof the hypothesis string to the reference string that more closely resembles\nthe actual transcription quality, and a system is penalized if it provides poor\ntime annotations. Since word-level timing information is often not available,\nwe present a way to approximate exact word-level timings from segment-level\ntimings (e.g., a sentence) and show that the approximation leads to a similar\nWER as a matching with exact word-level annotations. At the same time, the time\nconstraint leads to a speedup of the matching algorithm, which outweighs the\nadditional overhead caused by processing the time stamps.\n","authors":["Thilo von Neumann","Christoph Boeddeker","Marc Delcroix","Reinhold Haeb-Umbach"],"pdf_url":"https://arxiv.org/pdf/2307.11394v2.pdf","comment":"Presented at the CHiME7 workshop 2023"},{"id":"http://arxiv.org/abs/2305.11499v2","updated":"2023-10-02T03:59:04Z","published":"2023-05-19T08:02:52Z","title":"RCOT: Detecting and Rectifying Factual Inconsistency in Reasoning by\n Reversing Chain-of-Thought","summary":" Large language Models (LLMs) have achieved promising performance on\narithmetic reasoning tasks by incorporating step-by-step chain-of-thought (CoT)\nprompting. 
However, LLMs face challenges in maintaining factual consistency\nduring reasoning, exhibiting tendencies to condition overlooking, question\nmisinterpretation, and condition hallucination over given problems. Existing\nmethods use coarse-grained feedback (e.g., whether the answer is correct) to\nimprove factual consistency. In this work, we propose RCoT (Reversing\nChain-of-Thought), a novel method to improve LLMs' reasoning abilities by\nautomatically detecting and rectifying factual inconsistency in LLMs, generated\nsolutions. To detect factual inconsistency, RCoT first asks LLMs to reconstruct\nthe problem based on generated solutions. Then fine-grained comparisons between\nthe original problem and the reconstructed problem expose the factual\ninconsistency in the original solutions. To rectify the solution, RCoT\nformulates detected factual inconsistency into fine-grained feedback to guide\nLLMs in revising solutions. Experimental results demonstrate improvements of\nRCoT over standard CoT, Self-Consistency and Self-Refine across seven\narithmetic datasets. Moreover, we find that manually written fine-grained\nfeedback can dramatically improve LLMs' reasoning abilities (e.g., ChatGPT\nreaches 94.6% accuracy on GSM8K), encouraging the community to further explore\nthe fine-grained feedback generation methods.\n","authors":["Tianci Xue","Ziqi Wang","Zhenhailong Wang","Chi Han","Pengfei Yu","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2305.11499v2.pdf","comment":"24 pages, 21 figures"},{"id":"http://arxiv.org/abs/2308.16463v2","updated":"2023-10-02T03:31:17Z","published":"2023-08-31T05:15:27Z","title":"Sparkles: Unlocking Chats Across Multiple Images for Multimodal\n Instruction-Following Models","summary":" Large language models exhibit enhanced zero-shot performance on various tasks\nwhen fine-tuned with instruction-following data. Multimodal\ninstruction-following models extend these capabilities by integrating both text\nand images. However, existing models such as MiniGPT-4 face challenges in\nmaintaining dialogue coherence in scenarios involving multiple images. A\nprimary reason is the lack of a specialized dataset for this critical\napplication. To bridge these gaps, we present SparklesChat, a multimodal\ninstruction-following model for open-ended dialogues across multiple images. To\nsupport the training, we introduce SparklesDialogue, the first\nmachine-generated dialogue dataset tailored for word-level interleaved\nmulti-image and text interactions. Furthermore, we construct SparklesEval, a\nGPT-assisted benchmark for quantitatively assessing a model's conversational\ncompetence across multiple images and dialogue turns. Our experiments validate\nthe effectiveness of SparklesChat in understanding and reasoning across\nmultiple images and dialogue turns. Specifically, SparklesChat outperformed\nMiniGPT-4 on established vision-and-language benchmarks, including the BISON\nbinary image selection task and the NLVR2 visual reasoning task. Moreover,\nSparklesChat scored 8.56 out of 10 on SparklesEval, substantially exceeding\nMiniGPT-4's score of 3.91 and nearing GPT-4's score of 9.26. Qualitative\nevaluations further demonstrate SparklesChat's generality in handling\nreal-world applications. 
All resources are available at\nhttps://github.com/HYPJUDY/Sparkles.\n","authors":["Yupan Huang","Zaiqiao Meng","Fangyu Liu","Yixuan Su","Nigel Collier","Yutong Lu"],"pdf_url":"https://arxiv.org/pdf/2308.16463v2.pdf","comment":"Reduced main content to 9 pages; typos corrected"},{"id":"http://arxiv.org/abs/2309.17444v2","updated":"2023-10-02T01:46:44Z","published":"2023-09-29T17:54:46Z","title":"LLM-grounded Video Diffusion Models","summary":" Text-conditioned diffusion models have emerged as a promising tool for neural\nvideo generation. However, current models still struggle with intricate\nspatiotemporal prompts and often generate restricted or incorrect motion (e.g.,\neven lacking the ability to be prompted for objects moving from left to right).\nTo address these limitations, we introduce LLM-grounded Video Diffusion (LVD).\nInstead of directly generating videos from the text inputs, LVD first leverages\na large language model (LLM) to generate dynamic scene layouts based on the\ntext inputs and subsequently uses the generated layouts to guide a diffusion\nmodel for video generation. We show that LLMs are able to understand complex\nspatiotemporal dynamics from text alone and generate layouts that align closely\nwith both the prompts and the object motion patterns typically observed in the\nreal world. We then propose to guide video diffusion models with these layouts\nby adjusting the attention maps. Our approach is training-free and can be\nintegrated into any video diffusion model that admits classifier guidance. Our\nresults demonstrate that LVD significantly outperforms its base video diffusion\nmodel and several strong baseline methods in faithfully generating videos with\nthe desired attributes and motion patterns.\n","authors":["Long Lian","Baifeng Shi","Adam Yala","Trevor Darrell","Boyi Li"],"pdf_url":"https://arxiv.org/pdf/2309.17444v2.pdf","comment":"Project Page: https://llm-grounded-video-diffusion.github.io/"},{"id":"http://arxiv.org/abs/2309.13876v2","updated":"2023-10-02T01:10:01Z","published":"2023-09-25T05:01:34Z","title":"Reproducing Whisper-Style Training Using an Open-Source Toolkit and\n Publicly Available Data","summary":" Pre-training speech models on large volumes of data has achieved remarkable\nsuccess. OpenAI Whisper is a multilingual multitask model trained on 680k hours\nof supervised speech data. It generalizes well to various speech recognition\nand translation benchmarks even in a zero-shot setup. However, the full\npipeline for developing such models (from data collection to training) is not\npublicly accessible, which makes it difficult for researchers to further\nimprove its performance and address training-related issues such as efficiency,\nrobustness, fairness, and bias. This work presents an Open Whisper-style Speech\nModel (OWSM), which reproduces Whisper-style training using an open-source\ntoolkit and publicly available data. OWSM even supports more translation\ndirections and can be more efficient to train. 
We will publicly release all\nscripts used for data preparation, training, inference, and scoring as well as\npre-trained models and training logs to promote open science.\n","authors":["Yifan Peng","Jinchuan Tian","Brian Yan","Dan Berrebbi","Xuankai Chang","Xinjian Li","Jiatong Shi","Siddhant Arora","William Chen","Roshan Sharma","Wangyou Zhang","Yui Sudo","Muhammad Shakeel","Jee-weon Jung","Soumi Maiti","Shinji Watanabe"],"pdf_url":"https://arxiv.org/pdf/2309.13876v2.pdf","comment":"Accepted at ASRU 2023"},{"id":"http://arxiv.org/abs/2310.01693v1","updated":"2023-10-02T23:16:25Z","published":"2023-10-02T23:16:25Z","title":"Closing the Curious Case of Neural Text Degeneration","summary":" Despite their ubiquity in language generation, it remains unknown why\ntruncation sampling heuristics like nucleus sampling are so effective. We\nprovide a theoretical explanation for the effectiveness of the truncation\nsampling by proving that truncation methods that discard tokens below some\nprobability threshold (the most common type of truncation) can guarantee that\nall sampled tokens have nonzero true probability. However, thresholds are a\ncoarse heuristic, and necessarily discard some tokens with nonzero true\nprobability as well. In pursuit of a more precise sampling strategy, we show\nthat we can leverage a known source of model errors, the softmax bottleneck, to\nprove that certain tokens have nonzero true probability, without relying on a\nthreshold. Based on our findings, we develop an experimental truncation\nstrategy and the present pilot studies demonstrating the promise of this type\nof algorithm. Our evaluations show that our method outperforms its\nthreshold-based counterparts under automatic and human evaluation metrics for\nlow-entropy (i.e., close to greedy) open-ended text generation. Our theoretical\nfindings and pilot experiments provide both insight into why truncation\nsampling works, and make progress toward more expressive sampling algorithms\nthat better surface the generative capabilities of large language models.\n","authors":["Matthew Finlayson","John Hewitt","Alexander Koller","Swabha Swayamdipta","Ashish Sabharwal"],"pdf_url":"https://arxiv.org/pdf/2310.01693v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01691v1","updated":"2023-10-02T23:12:21Z","published":"2023-10-02T23:12:21Z","title":"Zero-Shot Continuous Prompt Transfer: Generalizing Task Semantics Across\n Language Models","summary":" Prompt tuning in natural language processing (NLP) has become an increasingly\npopular method for adapting large language models to specific tasks. However,\nthe transferability of these prompts, especially continuous prompts, between\ndifferent models remains a challenge. In this work, we propose a zero-shot\ncontinuous prompt transfer method, where source prompts are encoded into\nrelative space and the corresponding target prompts are searched for\ntransferring to target models. Experimental results confirm the effectiveness\nof our method, showing that 'task semantics' in continuous prompts can be\ngeneralized across various language models. Moreover, we find that combining\n'task semantics' from multiple source models can further enhance the\ngeneralizability of transfer.\n","authors":["Zijun Wu","Yongkang Wu","Lili Mou"],"pdf_url":"https://arxiv.org/pdf/2310.01691v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01688v1","updated":"2023-10-02T23:03:30Z","published":"2023-10-02T23:03:30Z","title":"One model to rule them all ? 
Towards End-to-End Joint Speaker\n Diarization and Speech Recognition","summary":" This paper presents a novel framework for joint speaker diarization (SD) and\nautomatic speech recognition (ASR), named SLIDAR (sliding-window\ndiarization-augmented recognition). SLIDAR can process arbitrary length inputs\nand can handle any number of speakers, effectively solving ``who spoke what,\nwhen'' concurrently. SLIDAR leverages a sliding window approach and consists of\nan end-to-end diarization-augmented speech transcription (E2E DAST) model which\nprovides, locally, for each window: transcripts, diarization and speaker\nembeddings. The E2E DAST model is based on an encoder-decoder architecture and\nleverages recent techniques such as serialized output training and\n``Whisper-style\" prompting. The local outputs are then combined to get the\nfinal SD+ASR result by clustering the speaker embeddings to get global speaker\nidentities. Experiments performed on monaural recordings from the AMI corpus\nconfirm the effectiveness of the method in both close-talk and far-field speech\nscenarios.\n","authors":["Samuele Cornell","Jee-weon Jung","Shinji Watanabe","Stefano Squartini"],"pdf_url":"https://arxiv.org/pdf/2310.01688v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06945v2","updated":"2023-10-02T22:38:42Z","published":"2023-07-13T17:59:21Z","title":"In-context Autoencoder for Context Compression in a Large Language Model","summary":" We propose the In-context Autoencoder (ICAE), leveraging the power of a large\nlanguage models (LLM) to compress a long context into short compact memory\nslots that can be directly conditioned on by the LLM for various purposes. ICAE\nis first pretrained using both autoencoding and language modeling objectives on\nmassive text data, enabling it to generate memory slots that accurately and\ncomprehensively represent the original context; Then, it is fine-tuned on\ninstruction data for producing desirable responses to various prompts.\nExperiments demonstrate that our lightweight ICAE, introducing fewer than 1%\nadditional parameters, effectively achieves 4X context compression based on\nLlama, offering advantages in both improved latency and GPU memory cost during\ninference, and showing an interesting insight in memorization as well as\npotential for scalability. These promising results imply a novel perspective on\nthe connection between working memory in cognitive science and representation\nlearning in LLMs, revealing ICAE's significant implications in addressing the\nlong context problem and suggesting further research in LLM context management.\nOur data, code and model are released at https://github.com/getao/icae.\n","authors":["Tao Ge","Jing Hu","Lei Wang","Xun Wang","Si-Qing Chen","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2307.06945v2.pdf","comment":"v2 (19 pages) with the code, data and model released"},{"id":"http://arxiv.org/abs/2304.04736v3","updated":"2023-10-02T20:56:35Z","published":"2023-04-10T17:47:39Z","title":"On the Possibilities of AI-Generated Text Detection","summary":" Our work addresses the critical issue of distinguishing text generated by\nLarge Language Models (LLMs) from human-produced text, a task essential for\nnumerous applications. Despite ongoing debate about the feasibility of such\ndifferentiation, we present evidence supporting its consistent achievability,\nexcept when human and machine text distributions are indistinguishable across\ntheir entire support. 
Drawing from information theory, we argue that as\nmachine-generated text approximates human-like quality, the sample size needed\nfor detection increases. We establish precise sample complexity bounds for\ndetecting AI-generated text, laying groundwork for future research aimed at\ndeveloping advanced, multi-sample detectors. Our empirical evaluations across\nmultiple datasets (Xsum, Squad, IMDb, and Kaggle FakeNews) confirm the\nviability of enhanced detection methods. We test various state-of-the-art text\ngenerators, including GPT-2, GPT-3.5-Turbo, Llama, Llama-2-13B-Chat-HF, and\nLlama-2-70B-Chat-HF, against detectors, including oBERTa-Large/Base-Detector,\nGPTZero. Our findings align with OpenAI's empirical data related to sequence\nlength, marking the first theoretical substantiation for these observations.\n","authors":["Souradip Chakraborty","Amrit Singh Bedi","Sicheng Zhu","Bang An","Dinesh Manocha","Furong Huang"],"pdf_url":"https://arxiv.org/pdf/2304.04736v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.07467v2","updated":"2023-10-02T20:48:41Z","published":"2022-10-14T02:34:12Z","title":"Query Rewriting for Effective Misinformation Discovery","summary":" We propose a novel system to help fact-checkers formulate search queries for\nknown misinformation claims and effectively search across multiple social media\nplatforms. We introduce an adaptable rewriting strategy, where editing actions\nfor queries containing claims (e.g., swap a word with its synonym; change verb\ntense into present simple) are automatically learned through offline\nreinforcement learning. Our model uses a decision transformer to learn a\nsequence of editing actions that maximizes query retrieval metrics such as mean\naverage precision. We conduct a series of experiments showing that our query\nrewriting system achieves a relative increase in the effectiveness of the\nqueries of up to 42%, while producing editing action sequences that are human\ninterpretable.\n","authors":["Ashkan Kazemi","Artem Abzaliev","Naihao Deng","Rui Hou","Scott A. Hale","Verónica Pérez-Rosas","Rada Mihalcea"],"pdf_url":"https://arxiv.org/pdf/2210.07467v2.pdf","comment":"AACL 2023 (long paper)"},{"id":"http://arxiv.org/abs/2310.01627v1","updated":"2023-10-02T20:45:41Z","published":"2023-10-02T20:45:41Z","title":"VAL: Interactive Task Learning with GPT Dialog Parsing","summary":" Reinforcement learning often requires millions of examples to produce static,\nblack-box models. In contrast, interactive task learning (ITL) emphasizes\nincremental knowledge acquisition from limited instruction provided by humans\nin modalities such as natural language. However, in practice, ITL systems often\nsuffers from brittle, error-prone language parsing. Large language models\n(LLMs) are resistant to brittleness but are not interpretable and cannot learn\nincrementally. We present VAL, an ITL system with a new philosophy for\nLLM/symbolic integration. By using LLMs only for specific tasks -- such as\npredicate and argument selection -- within an algorithmic framework, VAL reaps\nthe benefits of LLMs to support interactive learning of hierarchical task\nknowledge from natural language. Acquired knowledge is human interpretable and\ngeneralizes to support execution of novel tasks without additional training. We\nstudied users' interactions with VAL in a video game setting, finding that most\nusers could successfully teach VAL using language they felt was natural.\n","authors":["Lane Lawley","Christopher J. 
MacLellan"],"pdf_url":"https://arxiv.org/pdf/2310.01627v1.pdf","comment":"21 pages"},{"id":"http://arxiv.org/abs/2309.14006v2","updated":"2023-10-02T20:08:36Z","published":"2023-09-25T10:16:30Z","title":"Multiple evolutionary pressures shape identical consonant avoidance in\n the world's languages","summary":" Languages disfavor word forms containing sequences of similar or identical\nconsonants, due to the biomechanical and cognitive difficulties posed by\npatterns of this sort. However, the specific evolutionary processes responsible\nfor this phenomenon are not fully understood. Words containing sequences of\nidentical consonants may be more likely to arise than those without; processes\nof word form mutation may be more likely to remove than create sequences of\nidentical consonants in word forms; finally, words containing identical\nconsonants may die out more frequently than those without. Phylogenetic\nanalyses of the evolution of homologous word forms indicate that words with\nidentical consonants arise less frequently than those without, and processes\nwhich mutate word forms are more likely to remove sequences of identical\nconsonants than introduce them. However, words with identical consonants do not\ndie out more frequently than those without. Further analyses reveal that forms\nwith identical consonants are replaced in basic meaning functions more\nfrequently than words without. Taken together, results suggest that the under\nrepresentation of sequences of identical consonants is overwhelmingly a\nbyproduct of constraints on word form coinage, though processes related to word\nusage also serve to ensure that such patterns are infrequent in more salient\nvocabulary items. These findings clarify previously unknown aspects of\nprocesses of lexical evolution and competition that take place during language\nchange, optimizing communicative systems.\n","authors":["Chundra A. Cathcart"],"pdf_url":"https://arxiv.org/pdf/2309.14006v2.pdf","comment":"33 pp"},{"id":"http://arxiv.org/abs/2310.01603v1","updated":"2023-10-02T19:54:30Z","published":"2023-10-02T19:54:30Z","title":"A Review of Digital Learning Environments for Teaching Natural Language\n Processing in K-12 Education","summary":" Natural Language Processing (NLP) plays a significant role in our daily lives\nand has become an essential part of Artificial Intelligence (AI) education in\nK-12. As children grow up with NLP-powered applications, it is crucial to\nintroduce NLP concepts to them, fostering their understanding of language\nprocessing, language generation, and ethical implications of AI and NLP. This\npaper presents a comprehensive review of digital learning environments for\nteaching NLP in K-12. Specifically, it explores existing digital learning\ntools, discusses how they support specific NLP tasks and procedures, and\ninvestigates their explainability and evaluation results in educational\ncontexts. By examining the strengths and limitations of these tools, this\nliterature review sheds light on the current state of NLP learning tools in\nK-12 education. 
It aims to guide future research efforts to refine existing\ntools, develop new ones, and explore more effective and inclusive strategies\nfor integrating NLP into K-12 educational contexts.\n","authors":["Xiaoyi Tian","Kristy Elizabeth Boyer"],"pdf_url":"https://arxiv.org/pdf/2310.01603v1.pdf","comment":"24 pages, 13 figures"},{"id":"http://arxiv.org/abs/2310.01568v1","updated":"2023-10-02T19:03:11Z","published":"2023-10-02T19:03:11Z","title":"Defending Against Authorship Identification Attacks","summary":" Authorship identification has proven unsettlingly effective in inferring the\nidentity of the author of an unsigned document, even when sensitive personal\ninformation has been carefully omitted. In the digital era, individuals leave a\nlasting digital footprint through their written content, whether it is posted\non social media, stored on their employer's computers, or located elsewhere.\nWhen individuals need to communicate publicly yet wish to remain anonymous,\nthere is little available to protect them from unwanted authorship\nidentification. This unprecedented threat to privacy is evident in scenarios\nsuch as whistle-blowing. Proposed defenses against authorship identification\nattacks primarily aim to obfuscate one's writing style, thereby making it\nunlinkable to their pre-existing writing, while concurrently preserving the\noriginal meaning and grammatical integrity. The presented work offers a\ncomprehensive review of the advancements in this research area spanning over\nthe past two decades and beyond. It emphasizes the methodological frameworks of\nmodification and generation-based strategies devised to evade authorship\nidentification attacks, highlighting joint efforts from the differential\nprivacy community. Limitations of current research are discussed, with a\nspotlight on open challenges and potential research avenues.\n","authors":["Haining Wang"],"pdf_url":"https://arxiv.org/pdf/2310.01568v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01558v1","updated":"2023-10-02T18:52:35Z","published":"2023-10-02T18:52:35Z","title":"Making Retrieval-Augmented Language Models Robust to Irrelevant Context","summary":" Retrieval-augmented language models (RALMs) hold promise to produce language\nunderstanding systems that are are factual, efficient, and up-to-date. An\nimportant desideratum of RALMs, is that retrieved information helps model\nperformance when it is relevant, and does not harm performance when it is not.\nThis is particularly important in multi-hop reasoning scenarios, where misuse\nof irrelevant evidence can lead to cascading errors. However, recent work has\nshown that retrieval augmentation can sometimes have a negative effect on\nperformance. In this work, we present a thorough analysis on five open-domain\nquestion answering benchmarks, characterizing cases when retrieval reduces\naccuracy. We then propose two methods to mitigate this issue. First, a simple\nbaseline that filters out retrieved passages that do not entail question-answer\npairs according to a natural language inference (NLI) model. This is effective\nin preventing performance reduction, but at a cost of also discarding relevant\npassages. Thus, we propose a method for automatically generating data to\nfine-tune the language model to properly leverage retrieved passages, using a\nmix of relevant and irrelevant contexts at training time. 
We empirically show\nthat even 1,000 examples suffice to train the model to be robust to irrelevant\ncontexts while maintaining high performance on examples with relevant ones.\n","authors":["Ori Yoran","Tomer Wolfson","Ori Ram","Jonathan Berant"],"pdf_url":"https://arxiv.org/pdf/2310.01558v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01415v1","updated":"2023-10-02T17:59:57Z","published":"2023-10-02T17:59:57Z","title":"GPT-Driver: Learning to Drive with GPT","summary":" We present a simple yet effective approach that can transform the OpenAI\nGPT-3.5 model into a reliable motion planner for autonomous vehicles. Motion\nplanning is a core challenge in autonomous driving, aiming to plan a driving\ntrajectory that is safe and comfortable. Existing motion planners predominantly\nleverage heuristic methods to forecast driving trajectories, yet these\napproaches demonstrate insufficient generalization capabilities in the face of\nnovel and unseen driving scenarios. In this paper, we propose a novel approach\nto motion planning that capitalizes on the strong reasoning capabilities and\ngeneralization potential inherent to Large Language Models (LLMs). The\nfundamental insight of our approach is the reformulation of motion planning as\na language modeling problem, a perspective not previously explored.\nSpecifically, we represent the planner inputs and outputs as language tokens,\nand leverage the LLM to generate driving trajectories through a language\ndescription of coordinate positions. Furthermore, we propose a novel\nprompting-reasoning-finetuning strategy to stimulate the numerical reasoning\npotential of the LLM. With this strategy, the LLM can describe highly precise\ntrajectory coordinates and also its internal decision-making process in natural\nlanguage. We evaluate our approach on the large-scale nuScenes dataset, and\nextensive experiments substantiate the effectiveness, generalization ability,\nand interpretability of our GPT-based motion planner. Code will be released\nupon acceptance.\n","authors":["Jiageng Mao","Yuxi Qian","Hang Zhao","Yue Wang"],"pdf_url":"https://arxiv.org/pdf/2310.01415v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01387v1","updated":"2023-10-02T17:47:10Z","published":"2023-10-02T17:47:10Z","title":"It's MBR All the Way Down: Modern Generation Techniques Through the Lens\n of Minimum Bayes Risk","summary":" Minimum Bayes Risk (MBR) decoding is a method for choosing the outputs of a\nmachine learning system based not on the output with the highest probability,\nbut the output with the lowest risk (expected error) among multiple candidates.\nIt is a simple but powerful method: for an additional cost at inference time,\nMBR provides reliable several-point improvements across metrics for a wide\nvariety of tasks without any additional data or training. Despite this, MBR is\nnot frequently applied in NLP works, and knowledge of the method itself is\nlimited. We first provide an introduction to the method and the recent\nliterature. We show that several recent methods that do not reference MBR can\nbe written as special cases of MBR; this reformulation provides additional\ntheoretical justification for the performance of these methods, explaining some\nresults that were previously only empirical. 
We provide theoretical and\nempirical results about the effectiveness of various MBR variants and make\nconcrete recommendations for the application of MBR in NLP models, including\nfuture directions in this area.\n","authors":["Amanda Bertsch","Alex Xie","Graham Neubig","Matthew R. Gormley"],"pdf_url":"https://arxiv.org/pdf/2310.01387v1.pdf","comment":"Under submission"},{"id":"http://arxiv.org/abs/2310.01386v1","updated":"2023-10-02T17:46:09Z","published":"2023-10-02T17:46:09Z","title":"Who is ChatGPT? Benchmarking LLMs' Psychological Portrayal Using\n PsychoBench","summary":" Large Language Models (LLMs) have recently showcased their remarkable\ncapacities, not only in natural language processing tasks but also across\ndiverse domains such as clinical medicine, legal consultation, and education.\nLLMs become more than mere applications, evolving into assistants capable of\naddressing diverse user requests. This narrows the distinction between human\nbeings and artificial intelligence agents, raising intriguing questions\nregarding the potential manifestation of personalities, temperaments, and\nemotions within LLMs. In this paper, we propose a framework, PsychoBench, for\nevaluating diverse psychological aspects of LLMs. Comprising thirteen scales\ncommonly used in clinical psychology, PsychoBench further classifies these\nscales into four distinct categories: personality traits, interpersonal\nrelationships, motivational tests, and emotional abilities. Our study examines\nfive popular models, namely \\texttt{text-davinci-003}, ChatGPT, GPT-4,\nLLaMA-2-7b, and LLaMA-2-13b. Additionally, we employ a jailbreak approach to\nbypass the safety alignment protocols and test the intrinsic natures of LLMs.\nWe have made PsychoBench openly accessible via\n\\url{https://github.com/CUHK-ARISE/PsychoBench}.\n","authors":["Jen-tse Huang","Wenxuan Wang","Eric John Li","Man Ho Lam","Shujie Ren","Youliang Yuan","Wenxiang Jiao","Zhaopeng Tu","Michael R. Lyu"],"pdf_url":"https://arxiv.org/pdf/2310.01386v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2310.01382v1","updated":"2023-10-02T17:42:37Z","published":"2023-10-02T17:42:37Z","title":"Compressing LLMs: The Truth is Rarely Pure and Never Simple","summary":" Despite their remarkable achievements, modern Large Language Models (LLMs)\nencounter exorbitant computational and memory footprints. Recently, several\nworks have shown significant success in training-free and data-free compression\n(pruning and quantization) of LLMs achieving 50-60% sparsity and reducing the\nbit-width down to 3 or 4 bits per weight, with negligible perplexity\ndegradation over the uncompressed baseline. As recent research efforts are\nfocused on developing increasingly sophisticated compression methods, our work\ntakes a step back, and re-evaluates the effectiveness of existing SoTA\ncompression methods, which rely on a fairly simple and widely questioned\nmetric, perplexity (even for dense LLMs). We introduce Knowledge-Intensive\nCompressed LLM BenchmarK (LLM-KICK), a collection of carefully-curated tasks to\nre-define the evaluation protocol for compressed LLMs, which have significant\nalignment with their dense counterparts, and perplexity fail to capture subtle\nchange in their true capabilities. 
LLM-KICK unveils many favorable merits and\nunfortunate plights of current SoTA compression methods: all pruning methods\nsuffer significant performance degradation, sometimes at trivial sparsity\nratios (e.g., 25-30%), and fail for N:M sparsity on knowledge-intensive tasks;\ncurrent quantization methods are more successful than pruning; yet, pruned LLMs\neven at $\geq 50$% sparsity are robust in-context retrieval and summarization\nsystems; among others. LLM-KICK is designed to holistically assess compressed\nLLMs' ability for language understanding, reasoning, generation, in-context\nretrieval, in-context summarization, etc. We hope our study can foster the\ndevelopment of better LLM compression methods. All our related code is planned\nto be open-sourced.\n","authors":["Ajay Jaiswal","Zhe Gan","Xianzhi Du","Bowen Zhang","Zhangyang Wang","Yinfei Yang"],"pdf_url":"https://arxiv.org/pdf/2310.01382v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01381v1","updated":"2023-10-02T17:42:22Z","published":"2023-10-02T17:42:22Z","title":"DiffAR: Denoising Diffusion Autoregressive Model for Raw Speech Waveform\n Generation","summary":" Diffusion models have recently been shown to be relevant for high-quality\nspeech generation. Most work has been focused on generating spectrograms, and\nas such, they further require a subsequent model to convert the spectrogram to\na waveform (i.e., a vocoder). This work proposes a diffusion probabilistic\nend-to-end model for generating a raw speech waveform. The proposed model is\nautoregressive, generating overlapping frames sequentially, where each frame is\nconditioned on a portion of the previously generated one. Hence, our model can\neffectively synthesize an unlimited speech duration while preserving\nhigh-fidelity synthesis and temporal coherence. We implemented the proposed\nmodel for unconditional and conditional speech generation, where the latter can\nbe driven by an input sequence of phonemes, amplitudes, and pitch values.\nWorking on the waveform directly has some empirical advantages. Specifically,\nit allows the creation of local acoustic behaviors, like vocal fry, which makes\nthe overall waveform sound more natural. Furthermore, the proposed diffusion\nmodel is stochastic and not deterministic; therefore, each inference generates\na slightly different waveform variation, enabling an abundance of valid\nrealizations. Experiments show that the proposed model generates speech with\nsuperior quality compared with other state-of-the-art neural speech generation\nsystems.\n","authors":["Roi Benita","Michael Elad","Joseph Keshet"],"pdf_url":"https://arxiv.org/pdf/2310.01381v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01377v1","updated":"2023-10-02T17:40:01Z","published":"2023-10-02T17:40:01Z","title":"UltraFeedback: Boosting Language Models with High-quality Feedback","summary":" Reinforcement learning from human feedback (RLHF) has become a pivotal\ntechnique in aligning large language models (LLMs) with human preferences. In\nRLHF practice, preference data plays a crucial role in bridging human\nproclivity and LLMs. However, the scarcity of diverse, naturalistic datasets of\nhuman preferences on LLM outputs at scale poses a great challenge to RLHF as\nwell as feedback learning research within the open-source community. Current\npreference datasets, either proprietary or limited in size and prompt variety,\nresult in limited RLHF adoption in open-source models and hinder further\nexploration. 
In this study, we propose ULTRAFEEDBACK, a large-scale,\nhigh-quality, and diversified preference dataset designed to overcome these\nlimitations and foster RLHF development. To create ULTRAFEEDBACK, we compile a\ndiverse array of instructions and models from multiple sources to produce\ncomparative data. We meticulously devise annotation instructions and employ\nGPT-4 to offer detailed feedback in both numerical and textual forms.\nULTRAFEEDBACK establishes a reproducible and expandable preference data\nconstruction pipeline, serving as a solid foundation for future RLHF and\nfeedback learning research. Utilizing ULTRAFEEDBACK, we train various models to\ndemonstrate its effectiveness, including the reward model UltraRM, chat\nlanguage model UltraLM-13B-PPO, and critique model UltraCM. Experimental\nresults indicate that our models outperform existing open-source models,\nachieving top performance across multiple benchmarks. Our data and models are\navailable at https://github.com/thunlp/UltraFeedback.\n","authors":["Ganqu Cui","Lifan Yuan","Ning Ding","Guanming Yao","Wei Zhu","Yuan Ni","Guotong Xie","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2310.01377v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01361v1","updated":"2023-10-02T17:23:48Z","published":"2023-10-02T17:23:48Z","title":"GenSim: Generating Robotic Simulation Tasks via Large Language Models","summary":" Collecting large amounts of real-world interaction data to train general\nrobotic policies is often prohibitively expensive, thus motivating the use of\nsimulation data. However, existing methods for data generation have generally\nfocused on scene-level diversity (e.g., object instances and poses) rather than\ntask-level diversity, due to the human effort required to come up with and\nverify novel tasks. This has made it challenging for policies trained on\nsimulation data to demonstrate significant task-level generalization. In this\npaper, we propose to automatically generate rich simulation environments and\nexpert demonstrations by exploiting a large language models' (LLM) grounding\nand coding ability. Our approach, dubbed GenSim, has two modes: goal-directed\ngeneration, wherein a target task is given to the LLM and the LLM proposes a\ntask curriculum to solve the target task, and exploratory generation, wherein\nthe LLM bootstraps from previous tasks and iteratively proposes novel tasks\nthat would be helpful in solving more complex tasks. We use GPT4 to expand the\nexisting benchmark by ten times to over 100 tasks, on which we conduct\nsupervised finetuning and evaluate several LLMs including finetuned GPTs and\nCode Llama on code generation for robotic simulation tasks. Furthermore, we\nobserve that LLMs-generated simulation programs can enhance task-level\ngeneralization significantly when used for multitask policy training. 
We\nfurther find that with minimal sim-to-real adaptation, the multitask policies\npretrained on GPT4-generated simulation tasks exhibit stronger transfer to\nunseen long-horizon tasks in the real world and outperform baselines by 25%.\nSee the project website (https://liruiw.github.io/gensim) for code, demos, and\nvideos.\n","authors":["Lirui Wang","Yiyang Ling","Zhecheng Yuan","Mohit Shridhar","Chen Bao","Yuzhe Qin","Bailin Wang","Huazhe Xu","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2310.01361v1.pdf","comment":"See our project website (https://liruiw.github.io/gensim), demo\n (https://huggingface.co/spaces/Gen-Sim/Gen-Sim), and code\n (https://github.com/liruiw/GenSim) for visualizations and open-source models\n and datasets"},{"id":"http://arxiv.org/abs/2310.01352v1","updated":"2023-10-02T17:16:26Z","published":"2023-10-02T17:16:26Z","title":"RA-DIT: Retrieval-Augmented Dual Instruction Tuning","summary":" Retrieval-augmented language models (RALMs) improve performance by accessing\nlong-tail and up-to-date knowledge from external data stores, but are\nchallenging to build. Existing approaches require either expensive\nretrieval-specific modifications to LM pre-training or use post-hoc integration\nof the data store that leads to suboptimal performance. We introduce\nRetrieval-Augmented Dual Instruction Tuning (RA-DIT), a lightweight fine-tuning\nmethodology that provides a third option by retrofitting any LLM with retrieval\ncapabilities. Our approach operates in two distinct fine-tuning steps: (1) one\nupdates a pre-trained LM to better use retrieved information, while (2) the\nother updates the retriever to return more relevant results, as preferred by\nthe LM. By fine-tuning over tasks that require both knowledge utilization and\ncontextual awareness, we demonstrate that each stage yields significant\nperformance improvements, and using both leads to additional gains. Our best\nmodel, RA-DIT 65B, achieves state-of-the-art performance across a range of\nknowledge-intensive zero- and few-shot learning benchmarks, significantly\noutperforming existing in-context RALM approaches by up to +8.9% in 0-shot\nsetting and +1.4% in 5-shot setting on average.\n","authors":["Xi Victoria Lin","Xilun Chen","Mingda Chen","Weijia Shi","Maria Lomeli","Rich James","Pedro Rodriguez","Jacob Kahn","Gergely Szilvasy","Mike Lewis","Luke Zettlemoyer","Scott Yih"],"pdf_url":"https://arxiv.org/pdf/2310.01352v1.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2310.01339v1","updated":"2023-10-02T17:02:57Z","published":"2023-10-02T17:02:57Z","title":"Improving Dialogue Management: Quality Datasets vs Models","summary":" Task-oriented dialogue systems (TODS) have become crucial for users to\ninteract with machines and computers using natural language. One of its key\ncomponents is the dialogue manager, which guides the conversation towards a\ngood goal for the user by providing the best possible response. Previous works\nhave proposed rule-based systems (RBS), reinforcement learning (RL), and\nsupervised learning (SL) as solutions for the correct dialogue management; in\nother words, select the best response given input by the user. However, this\nwork argues that the leading cause of DMs not achieving maximum performance\nresides in the quality of the datasets rather than the models employed thus\nfar; this means that dataset errors, like mislabeling, originate a large\npercentage of failures in dialogue management. 
We studied the main errors in\nthe most widely used datasets, Multiwoz 2.1 and SGD, to demonstrate this\nhypothesis. To do this, we have designed a synthetic dialogue generator to\nfully control the amount and type of errors introduced in the dataset. Using\nthis generator, we demonstrated that errors in the datasets contribute\nproportionally to the performance of the models\n","authors":["Miguel Ángel Medina-Ramírez","Cayetano Guerra-Artal","Mario Hernández-Tejera"],"pdf_url":"https://arxiv.org/pdf/2310.01339v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01469v1","updated":"2023-10-02T17:01:56Z","published":"2023-10-02T17:01:56Z","title":"LLM Lies: Hallucinations are not Bugs, but Features as Adversarial\n Examples","summary":" Large Language Models (LLMs), including GPT-3.5, LLaMA, and PaLM, seem to be\nknowledgeable and able to adapt to many tasks. However, we still can not\ncompletely trust their answer, since LLMs suffer from\nhallucination--fabricating non-existent facts to cheat users without\nperception. And the reasons for their existence and pervasiveness remain\nunclear. In this paper, we demonstrate that non-sense prompts composed of\nrandom tokens can also elicit the LLMs to respond with hallucinations. This\nphenomenon forces us to revisit that hallucination may be another view of\nadversarial examples, and it shares similar features with conventional\nadversarial examples as the basic feature of LLMs. Therefore, we formalize an\nautomatic hallucination triggering method as the hallucination attack in an\nadversarial way. Finally, we explore basic feature of attacked adversarial\nprompts and propose a simple yet effective defense strategy. Our code is\nreleased on GitHub.\n","authors":["Jia-Yu Yao","Kun-Peng Ning","Zhen-Hui Liu","Mu-Nan Ning","Li Yuan"],"pdf_url":"https://arxiv.org/pdf/2310.01469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01468v1","updated":"2023-10-02T16:55:37Z","published":"2023-10-02T16:55:37Z","title":"The Entity-Deduction Arena: A playground for probing the conversational\n reasoning and planning capabilities of LLMs","summary":" Large language models (LLMs) are currently effective at answering questions\nthat are clearly asked. However, when faced with ambiguous queries they can act\nunpredictably and produce incorrect outputs. This underscores the need for the\ndevelopment of intelligent agents capable of asking clarification questions to\nresolve ambiguities effectively. This capability requires complex\nunderstanding, state tracking, reasoning and planning over multiple\nconversational turns. However, directly measuring this can be challenging. In\nthis paper, we offer a surrogate problem which assesses an LLMs's capability to\ndeduce an entity unknown to itself, but revealed to a judge, by asking the\njudge a series of queries. This \\textit{entity-deducing game} can serve as an\nevaluation framework to probe the conversational reasoning and planning\ncapabilities of language models. We systematically evaluate various LLMs and\ndiscover significant differences in their performance on this task. We find\nthat strong LLMs like GPT-4 outperform human players by a large margin. We\nfurther employ Behavior Cloning (BC) to examine whether a weaker model is\ncapable of imitating a stronger model and generalizing to data or domains,\nusing only the demonstrations from a stronger model. 
We finally propose to use\nReinforcement Learning to enhance the reasoning and planning capacity of Vicuna\nmodels through episodes of game playing, which leads to significant performance\nimprovements. We hope that this problem offers insights into how autonomous\nagents could be trained to behave more intelligently in ambiguous\ncircumstances.\n","authors":["Yizhe Zhang","Jiarui Lu","Navdeep Jaitly"],"pdf_url":"https://arxiv.org/pdf/2310.01468v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01334v1","updated":"2023-10-02T16:51:32Z","published":"2023-10-02T16:51:32Z","title":"Merge, Then Compress: Demystify Efficient SMoE with Hints from Its\n Routing Policy","summary":" Sparsely activated Mixture-of-Experts (SMoE) has shown promise to scale up\nthe learning capacity of neural networks; however, such models have issues like (a)\nHigh Memory Usage, due to duplication of the network layers into multiple\ncopies as experts; and (b) Redundancy in Experts, as common learning-based\nrouting policies suffer from representational collapse. Therefore, vanilla SMoE\nmodels are memory inefficient and non-scalable, especially for\nresource-constrained downstream scenarios. In this paper, we ask: Can we craft\na compact SMoE model by consolidating expert information? What is the best\nrecipe to merge multiple experts into fewer but more knowledgeable experts? Our\npilot investigation reveals that conventional model merging methods fail to be\neffective in such expert merging for SMoE. The potential reasons are: (1)\nredundant information overshadows critical experts; (2) appropriate neuron\npermutation for each expert is missing to bring all of them in alignment. To\naddress this, we propose M-SMoE, which leverages routing statistics to guide\nexpert merging. Specifically, it starts with neuron permutation alignment for\nexperts; then, dominant experts and their \"group members\" are formed; lastly,\nevery expert group is merged into a single expert by utilizing each expert's\nactivation frequency as their weight for merging, thus diminishing the impact\nof insignificant experts. Moreover, we observed that our proposed merging\npromotes a low dimensionality in the merged expert's weight space, naturally\npaving the way for additional compression. Hence, our final method, MC-SMoE\n(i.e., Merge, then Compress SMoE), further decomposes the merged experts into\nlow-rank and structural sparse alternatives. Extensive experiments across 8\nbenchmarks validate the effectiveness of MC-SMoE. For instance, our MC-SMoE\nachieves up to 80% memory and a 20% FLOPs reduction, with virtually no loss in\nperformance.\n","authors":["Pingzhi Li","Zhenyu Zhang","Prateek Yadav","Yi-Lin Sung","Yu Cheng","Mohit Bansal","Tianlong Chen"],"pdf_url":"https://arxiv.org/pdf/2310.01334v1.pdf","comment":"17 pages, 5 figures, 11 tables"},{"id":"http://arxiv.org/abs/2310.01329v1","updated":"2023-10-02T16:48:47Z","published":"2023-10-02T16:48:47Z","title":"BTR: Binary Token Representations for Efficient Retrieval Augmented\n Language Models","summary":" Retrieval augmentation addresses many critical problems in large language\nmodels such as hallucination, staleness, and privacy leaks. However, running\nretrieval-augmented language models (LMs) is slow and difficult to scale due to\nprocessing large amounts of retrieved text. We introduce binary token\nrepresentations (BTR), which use 1-bit vectors to precompute every token in\npassages, significantly reducing computation during inference. 
Despite the\npotential loss of accuracy, our new calibration techniques and training\nobjectives restore performance. Combined with offline and runtime compression,\nthis only requires 127GB of disk space for encoding 3 billion tokens in\nWikipedia. Our experiments show that on five knowledge-intensive NLP tasks, BTR\naccelerates state-of-the-art inference by up to 4x and reduces storage by over\n100x while maintaining over 95% task performance.\n","authors":["Qingqing Cao","Sewon Min","Yizhong Wang","Hannaneh Hajishirzi"],"pdf_url":"https://arxiv.org/pdf/2310.01329v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01467v1","updated":"2023-10-02T16:43:14Z","published":"2023-10-02T16:43:14Z","title":"FedBPT: Efficient Federated Black-box Prompt Tuning for Large Language\n Models","summary":" Pre-trained language models (PLM) have revolutionized the NLP landscape,\nachieving stellar performances across diverse tasks. These models, while\nbenefiting from vast training data, often require fine-tuning on specific data\nto cater to distinct downstream tasks. However, this data adaptation process\nhas inherent security and privacy concerns, primarily when leveraging\nuser-generated, device-residing data. Federated learning (FL) provides a\nsolution, allowing collaborative model fine-tuning without centralized data\ncollection. However, applying FL to finetune PLMs is hampered by challenges,\nincluding restricted model parameter access, high computational requirements,\nand communication overheads. This paper introduces Federated Black-box Prompt\nTuning (FedBPT), a framework designed to address these challenges. FedBPT does\nnot require the clients to access the model parameters. By focusing on training\noptimal prompts and utilizing gradient-free optimization methods, FedBPT\nreduces the number of exchanged variables, boosts communication efficiency, and\nminimizes computational and storage costs. Experiments highlight the\nframework's ability to drastically cut communication and memory costs while\nmaintaining competitive performance. Ultimately, FedBPT presents a promising\nsolution for efficient, privacy-preserving fine-tuning of PLM in the age of\nlarge language models.\n","authors":["Jingwei Sun","Ziyue Xu","Hongxu Yin","Dong Yang","Daguang Xu","Yiran Chen","Holger R. Roth"],"pdf_url":"https://arxiv.org/pdf/2310.01467v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01320v1","updated":"2023-10-02T16:27:36Z","published":"2023-10-02T16:27:36Z","title":"Avalon's Game of Thoughts: Battle Against Deception through Recursive\n Contemplation","summary":" Recent breakthroughs in large language models (LLMs) have brought remarkable\nsuccess in the field of LLM-as-Agent. Nevertheless, a prevalent assumption is\nthat the information processed by LLMs is consistently honest, neglecting the\npervasive deceptive or misleading information in human society and AI-generated\ncontent. This oversight makes LLMs susceptible to malicious manipulations,\npotentially resulting in detrimental outcomes. This study utilizes the\nintricate Avalon game as a testbed to explore LLMs' potential in deceptive\nenvironments. Avalon, full of misinformation and requiring sophisticated logic,\nmanifests as a \"Game-of-Thoughts\". Inspired by the efficacy of humans'\nrecursive thinking and perspective-taking in the Avalon game, we introduce a\nnovel framework, Recursive Contemplation (ReCon), to enhance LLMs' ability to\nidentify and counteract deceptive information. 
ReCon combines formulation and\nrefinement contemplation processes; formulation contemplation produces initial\nthoughts and speech, while refinement contemplation further polishes them.\nAdditionally, we incorporate first-order and second-order perspective\ntransitions into these processes respectively. Specifically, the first-order\nallows an LLM agent to infer others' mental states, and the second-order\ninvolves understanding how others perceive the agent's mental state. After\nintegrating ReCon with different LLMs, extensive experimental results from the\nAvalon game indicate its efficacy in aiding LLMs to discern and maneuver around\ndeceptive information without extra fine-tuning and data. Finally, we offer a\npossible explanation for the efficacy of ReCon and explore the current\nlimitations of LLMs in terms of safety, reasoning, speaking style, and format,\npotentially furnishing insights for subsequent research.\n","authors":["Shenzhi Wang","Chang Liu","Zilong Zheng","Siyuan Qi","Shuo Chen","Qisen Yang","Andrew Zhao","Chaofei Wang","Shiji Song","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2310.01320v1.pdf","comment":"40 pages"},{"id":"http://arxiv.org/abs/2310.01299v1","updated":"2023-10-02T16:00:37Z","published":"2023-10-02T16:00:37Z","title":"Generating Explanations in Medical Question-Answering by Expectation\n Maximization Inference over Evidence","summary":" Medical Question Answering~(medical QA) systems play an essential role in\nassisting healthcare workers in finding answers to their questions. However, it\nis not sufficient for medical QA systems to merely provide answers, because users\nmight want explanations, that is, more analytic statements in natural language\nthat describe the elements and context that support the answer. To do so, we\npropose a novel approach for generating natural language explanations for\nanswers predicted by medical QA systems. As high-quality medical explanations\nrequire additional medical knowledge, our system extracts knowledge from\nmedical textbooks to enhance the quality of explanations during the explanation\ngeneration process. Concretely, we designed an expectation-maximization\napproach that makes inferences about the evidence found in these texts,\noffering an efficient way to focus attention on lengthy evidence passages.\nExperimental results, conducted on two datasets MQAE-diag and MQAE, demonstrate\nthe effectiveness of our framework for reasoning with textual evidence. Our\napproach outperforms state-of-the-art models, achieving a significant\nimprovement of \textbf{6.86} and \textbf{9.43} percentage points on the Rouge-1\nscore; \textbf{8.23} and \textbf{7.82} percentage points on the Bleu-4 score on\nthe respective datasets.\n","authors":["Wei Sun","Mingxiao Li","Damien Sileo","Jesse Davis","Marie-Francine Moens"],"pdf_url":"https://arxiv.org/pdf/2310.01299v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01297v1","updated":"2023-10-02T15:59:10Z","published":"2023-10-02T15:59:10Z","title":"Co-audit: tools to help humans double-check AI-generated content","summary":" Users are increasingly being warned to check AI-generated content for\ncorrectness. Still, as LLMs (and other generative models) generate more complex\noutput, such as summaries, tables, or code, it becomes harder for the user to\naudit or evaluate the output for quality or correctness. Hence, we are seeing\nthe emergence of tool-assisted experiences to help the user double-check a\npiece of AI-generated content. 
We refer to these as co-audit tools. Co-audit\ntools complement prompt engineering techniques: one helps the user construct\nthe input prompt, while the other helps them check the output response. As a\nspecific example, this paper describes recent research on co-audit tools for\nspreadsheet computations powered by generative models. We explain why co-audit\nexperiences are essential for any application of generative AI where quality is\nimportant and errors are consequential (as is common in spreadsheet\ncomputations). We propose a preliminary list of principles for co-audit, and\noutline research challenges.\n","authors":["Andrew D. Gordon","Carina Negreanu","José Cambronero","Rasika Chakravarthy","Ian Drosos","Hao Fang","Bhaskar Mitra","Hannah Richardson","Advait Sarkar","Stephanie Simmons","Jack Williams","Ben Zorn"],"pdf_url":"https://arxiv.org/pdf/2310.01297v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01290v1","updated":"2023-10-02T15:43:53Z","published":"2023-10-02T15:43:53Z","title":"Knowledge Crosswords: Geometric Reasoning over Structured Knowledge with\n Large Language Models","summary":" Large language models (LLMs) are widely adopted in knowledge-intensive tasks\nand have achieved impressive performance thanks to their knowledge abilities.\nWhile LLMs have demonstrated outstanding performance on atomic or linear\n(multi-hop) QA tasks, whether they can reason in knowledge-rich scenarios with\ninterweaving constraints remains an underexplored problem. In this work, we\npropose geometric reasoning over structured knowledge, where pieces of\nknowledge are connected in a graph structure and models need to fill in the\nmissing information. Such geometric knowledge reasoning would require the\nability to handle structured knowledge, reason with uncertainty, verify facts,\nand backtrack when an error occurs. We propose Knowledge Crosswords, a\nmulti-blank QA dataset where each problem consists of a natural language\nquestion representing the geometric constraints of an incomplete entity\nnetwork, where LLMs are tasked with working out the missing entities while\nmeeting all factual constraints. Knowledge Crosswords contains 2,101 individual\nproblems, covering various knowledge domains and further divided into three\ndifficulty levels. We conduct extensive experiments to evaluate existing LLM\nprompting approaches on the Knowledge Crosswords benchmark. We additionally\npropose two new approaches, Staged Prompting and Verify-All, to augment LLMs'\nability to backtrack and verify structured constraints. Our results demonstrate\nthat while baseline approaches perform well on easier problems but struggle\nwith hard ones, our proposed Verify-All outperforms other methods by a large\nmargin and is more robust with hard problems. 
Further analysis reveals that\nLLMs' ability of geometric reasoning over structured knowledge is still far\nfrom robust or perfect, susceptible to confounders such as the order of\noptions, certain structural patterns, assumption of existence of correct\nanswer, and more.\n","authors":["Wenxuan Ding","Shangbin Feng","Yuhan Liu","Zhaoxuan Tan","Vidhisha Balachandran","Tianxing He","Yulia Tsvetkov"],"pdf_url":"https://arxiv.org/pdf/2310.01290v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01271v1","updated":"2023-10-02T15:16:31Z","published":"2023-10-02T15:16:31Z","title":"LEEC: A Legal Element Extraction Dataset with an Extensive\n Domain-Specific Label System","summary":" As a pivotal task in natural language processing, element extraction has\ngained significance in the legal domain. Extracting legal elements from\njudicial documents helps enhance interpretative and analytical capacities of\nlegal cases, and thereby facilitating a wide array of downstream applications\nin various domains of law. Yet existing element extraction datasets are limited\nby their restricted access to legal knowledge and insufficient coverage of\nlabels. To address this shortfall, we introduce a more comprehensive,\nlarge-scale criminal element extraction dataset, comprising 15,831 judicial\ndocuments and 159 labels. This dataset was constructed through two main steps:\nFirst, designing the label system by our team of legal experts based on prior\nlegal research which identified critical factors driving and processes\ngenerating sentencing outcomes in criminal cases; Second, employing the legal\nknowledge to annotate judicial documents according to the label system and\nannotation guideline. The Legal Element ExtraCtion dataset (LEEC) represents\nthe most extensive and domain-specific legal element extraction dataset for the\nChinese legal system. Leveraging the annotated data, we employed various SOTA\nmodels that validates the applicability of LEEC for Document Event Extraction\n(DEE) task. The LEEC dataset is available on https://github.com/THUlawtech/LEEC .\n","authors":["Xue Zongyue","Liu Huanghai","Hu Yiran","Kong Kangle","Wang Chenlu","Liu Yun","Shen Weixing"],"pdf_url":"https://arxiv.org/pdf/2310.01271v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01260v1","updated":"2023-10-02T14:51:16Z","published":"2023-10-02T14:51:16Z","title":"SPELL: Semantic Prompt Evolution based on a LLM","summary":" Prompt engineering is a new paradigm for enhancing the performance of trained\nneural network models. For optimizing text-style prompts, existing methods\nusually individually operate small portions of a text step by step, which\neither breaks the fluency or could not globally adjust a prompt. Since large\nlanguage models (LLMs) have powerful ability of generating coherent texts token\nby token, can we utilize LLMs for improving prompts? Based on this motivation,\nin this paper, considering a trained LLM as a text generator, we attempt to\ndesign a black-box evolution algorithm for automatically optimizing texts,\nnamely SPELL (Semantic Prompt Evolution based on a LLM). 
The proposed method is\nevaluated with different LLMs and evolution parameters in different text tasks.\nExperimental results show that SPELL could rapidly improve the prompts indeed.\nWe further explore the evolution process and discuss on the limitations,\npotential possibilities and future work.\n","authors":["Yujian Betterest Li","Kai Wu"],"pdf_url":"https://arxiv.org/pdf/2310.01260v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01248v1","updated":"2023-10-02T14:32:07Z","published":"2023-10-02T14:32:07Z","title":"Improving Emotional Expression and Cohesion in Image-Based Playlist\n Description and Music Topics: A Continuous Parameterization Approach","summary":" Text generation in image-based platforms, particularly for music-related\ncontent, requires precise control over text styles and the incorporation of\nemotional expression. However, existing approaches often need help to control\nthe proportion of external factors in generated text and rely on discrete\ninputs, lacking continuous control conditions for desired text generation. This\nstudy proposes Continuous Parameterization for Controlled Text Generation\n(CPCTG) to overcome these limitations. Our approach leverages a Language Model\n(LM) as a style learner, integrating Semantic Cohesion (SC) and Emotional\nExpression Proportion (EEP) considerations. By enhancing the reward method and\nmanipulating the CPCTG level, our experiments on playlist description and music\ntopic generation tasks demonstrate significant improvements in ROUGE scores,\nindicating enhanced relevance and coherence in the generated text.\n","authors":["Yuelyu Ji","Yuheng Song","Wei Wang","Ruoyi Xu","Zhongqian Xie","Huiyun Liu"],"pdf_url":"https://arxiv.org/pdf/2310.01248v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01217v1","updated":"2023-10-02T14:01:36Z","published":"2023-10-02T14:01:36Z","title":"ScaLearn: Simple and Highly Parameter-Efficient Task Transfer by\n Learning to Scale","summary":" Multi-task learning (MTL) has shown considerable practical benefits,\nparticularly when using pre-trained language models (PLMs). While this is\ncommonly achieved by simultaneously learning $n$ tasks under a joint\noptimization procedure, recent methods such as AdapterFusion structure the\nproblem into two distinct stages: (i) task learning, where knowledge specific\nto a task is encapsulated within sets of parameters (\\eg adapters), and (ii)\ntransfer, where this already learned knowledge is leveraged for a target task.\nThis separation of concerns provides numerous benefits, such as promoting\nreusability, and addressing cases involving data privacy and societal concerns;\non the flip side, current two-stage MTL methods come with the cost of\nintroducing a substantial number of additional parameters. In this work, we\naddress this issue by leveraging the usefulness of linearly scaling the output\nrepresentations of source adapters for transfer learning. We introduce\nScaLearn, a simple and highly parameter-efficient two-stage MTL method that\ncapitalizes on the knowledge of the source tasks by learning a minimal set of\nscaling parameters that enable effective knowledge transfer to a target task.\nOur experiments on three benchmarks (GLUE, SuperGLUE, and HumSet) show that our\nScaLearn, in addition to facilitating the benefits of two-stage MTL,\nconsistently outperforms strong baselines with only a small number of transfer\nparameters - roughly 0.35% of those of AdapterFusion. 
Remarkably, we observe\nthat ScaLearn maintains its strong abilities even when further reducing\nparameters through uniform scaling and layer-sharing, achieving similarly\ncompetitive results with only $8$ transfer parameters for each target task. Our\nproposed approach thus demonstrates the promise of simple scaling\nfor more efficient task transfer.\n","authors":["Markus Frohmann","Carolin Holtermann","Shahed Masoudian","Anne Lauscher","Navid Rekabsaz"],"pdf_url":"https://arxiv.org/pdf/2310.01217v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01208v1","updated":"2023-10-02T13:53:03Z","published":"2023-10-02T13:53:03Z","title":"Label Supervised LLaMA Finetuning","summary":" The recent success of Large Language Models (LLMs) has gained significant\nattention in both academia and industry. Substantial efforts have been made to\nenhance the zero- and few-shot generalization capabilities of open-source LLMs\nthrough finetuning. Currently, the prevailing approach is instruction-tuning,\nwhich trains LLMs to complete real-world tasks by generating responses guided\nby natural language instructions. It is worth noting that such an approach\nmay underperform in sequence and token classification tasks. Unlike text\ngeneration tasks, classification tasks have a limited label space, where\nprecise label prediction is more appreciated than generating diverse and\nhuman-like responses. Prior research has unveiled that instruction-tuned LLMs\ncannot outperform BERT, prompting us to explore the potential of leveraging\nlatent representations from LLMs for supervised label prediction. In this\npaper, we introduce a label-supervised adaptation for LLMs, which aims to\nfinetune the model with discriminant labels. We evaluate this approach with\nLabel Supervised LLaMA (LS-LLaMA), based on LLaMA-2-7B, a relatively\nsmall-scale LLM that can be finetuned on a single GeForce RTX4090 GPU. We\nextract latent representations from the final LLaMA layer and project them into\nthe label space to compute the cross-entropy loss. The model is finetuned by\nLow-Rank Adaptation (LoRA) to minimize this loss. Remarkably, without intricate\nprompt engineering or external knowledge, LS-LLaMA substantially outperforms\nLLMs ten times its size and demonstrates consistent improvements\ncompared to robust baselines like BERT-Large and RoBERTa-Large in text\nclassification. Moreover, by removing the causal mask from decoders, LS-unLLaMA\nachieves state-of-the-art performance in named entity recognition (NER).\nOur work will shed light on a novel approach to adapting LLMs for various\ndownstream tasks.\n","authors":["Zongxi Li","Xianming Li","Yuzhang Liu","Haoran Xie","Jing Li","Fu-lee Wang","Qing Li","Xiaoqin Zhong"],"pdf_url":"https://arxiv.org/pdf/2310.01208v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01188v1","updated":"2023-10-02T13:26:43Z","published":"2023-10-02T13:26:43Z","title":"Quantifying the Plausibility of Context Reliance in Neural Machine\n Translation","summary":" Establishing whether language models can use contextual information in a\nhuman-plausible way is important to ensure their safe adoption in real-world\nsettings. However, the questions of when and which parts of the context affect\nmodel generations are typically tackled separately, and current plausibility\nevaluations are practically limited to a handful of artificial benchmarks. 
To\naddress this, we introduce Plausibility Evaluation of Context Reliance\n(PECoRe), an end-to-end interpretability framework designed to quantify context\nusage in language models' generations. Our approach leverages model internals\nto (i) contrastively identify context-sensitive target tokens in generated\ntexts and (ii) link them to contextual cues justifying their prediction. We use\nPECoRe to quantify the plausibility of context-aware machine translation\nmodels, comparing model rationales with human annotations across several\ndiscourse-level phenomena. Finally, we apply our method to unannotated\ngenerations to identify context-mediated predictions and highlight instances of\n(im)plausible context usage in model translations.\n","authors":["Gabriele Sarti","Grzegorz Chrupała","Malvina Nissim","Arianna Bisazza"],"pdf_url":"https://arxiv.org/pdf/2310.01188v1.pdf","comment":"Preprint, under review. 24 pages, 8 figures"},{"id":"http://arxiv.org/abs/2310.01459v1","updated":"2023-10-02T13:24:00Z","published":"2023-10-02T13:24:00Z","title":"NarrativePlay: Interactive Narrative Understanding","summary":" In this paper, we introduce NarrativePlay, a novel system that allows users\nto role-play a fictional character and interact with other characters in\nnarratives such as novels in an immersive environment. We leverage Large\nLanguage Models (LLMs) to generate human-like responses, guided by personality\ntraits extracted from narratives. The system incorporates auto-generated visual\ndisplay of narrative settings, character portraits, and character speech,\ngreatly enhancing user experience. Our approach eschews predefined sandboxes,\nfocusing instead on main storyline events extracted from narratives from the\nperspective of a user-selected character. NarrativePlay has been evaluated on\ntwo types of narratives, detective and adventure stories, where users can\neither explore the world or improve their favorability with the narrative\ncharacters through conversations.\n","authors":["Runcong Zhao","Wenjia Zhang","Jiazheng Li","Lixing Zhu","Yanran Li","Yulan He","Lin Gui"],"pdf_url":"https://arxiv.org/pdf/2310.01459v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01138v1","updated":"2023-10-02T12:25:05Z","published":"2023-10-02T12:25:05Z","title":"Target-Aware Contextual Political Bias Detection in News","summary":" Media bias detection requires comprehensive integration of information\nderived from multiple news sources. Sentence-level political bias detection in\nnews is no exception, and has proven to be a challenging task that requires an\nunderstanding of bias in consideration of the context. Inspired by the fact\nthat humans exhibit varying degrees of writing styles, resulting in a diverse\nrange of statements with different local and global contexts, previous work in\nmedia bias detection has proposed augmentation techniques to exploit this fact.\nDespite their success, we observe that these techniques introduce noise by\nover-generalizing bias context boundaries, which hinders performance. To\nalleviate this issue, we propose techniques to more carefully search for\ncontext using a bias-sensitive, target-aware approach for data augmentation.\nComprehensive experiments on the well-known BASIL dataset show that when\ncombined with pre-trained models such as BERT, our augmentation techniques lead\nto state-of-the-art results. 
Our approach outperforms previous methods\nsignificantly, obtaining an F1-score of 58.15 over state-of-the-art bias\ndetection task.\n","authors":["Iffat Maab","Edison Marrese-Taylor","Yutaka Matsuo"],"pdf_url":"https://arxiv.org/pdf/2310.01138v1.pdf","comment":"11 pages, 3 figures, conference paper accepted in IJCNLP-AACL 2023\n but will get published after Nov 4th Bali conference"},{"id":"http://arxiv.org/abs/2310.01132v1","updated":"2023-10-02T12:11:17Z","published":"2023-10-02T12:11:17Z","title":"Automated Evaluation of Classroom Instructional Support with LLMs and\n BoWs: Connecting Global Predictions to Specific Feedback","summary":" With the aim to provide teachers with more specific, frequent, and actionable\nfeedback about their teaching, we explore how Large Language Models (LLMs) can\nbe used to estimate ``Instructional Support'' domain scores of the CLassroom\nAssessment Scoring System (CLASS), a widely used observation protocol. We\ndesign a machine learning architecture that uses either zero-shot prompting of\nMeta's Llama2, and/or a classic Bag of Words (BoW) model, to classify\nindividual utterances of teachers' speech (transcribed automatically using\nOpenAI's Whisper) for the presence of 11 behavioral indicators of Instructional\nSupport. Then, these utterance-level judgments are aggregated over an entire\n15-min observation session to estimate a global CLASS score. Experiments on two\nCLASS-coded datasets of toddler and pre-kindergarten classrooms indicate that\n(1) automatic CLASS Instructional Support estimation accuracy using the\nproposed method (Pearson $R$ up to $0.46$) approaches human inter-rater\nreliability (up to $R=0.55$); (2) LLMs yield slightly greater accuracy than BoW\nfor this task; and (3) the best models often combined features extracted from\nboth LLM and BoW. Finally, (4) we illustrate how the model's outputs can be\nvisualized at the utterance level to provide teachers with explainable feedback\non which utterances were most positively or negatively correlated with specific\nCLASS dimensions.\n","authors":["Jacob Whitehill","Jennifer LoCasale-Crouch"],"pdf_url":"https://arxiv.org/pdf/2310.01132v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01119v1","updated":"2023-10-02T11:49:05Z","published":"2023-10-02T11:49:05Z","title":"Text Data Augmentation in Low-Resource Settings via Fine-Tuning of Large\n Language Models","summary":" The in-context learning ability of large language models (LLMs) enables them\nto generalize to novel downstream tasks with relatively few labeled examples.\nHowever, they require enormous computational resources to be deployed.\nAlternatively, smaller models can solve specific tasks if fine-tuned with\nenough labeled examples. These examples, however, are expensive to obtain. In\npursuit of the best of both worlds, we study the annotation and generation of\nfine-tuning training data via fine-tuned teacher LLMs to improve the downstream\nperformance of much smaller models. 
In four text classification and two text\ngeneration tasks, we find that both data generation and annotation dramatically\nimprove the respective downstream model's performance, occasionally\nnecessitating only a minor fraction of the original training dataset.\n","authors":["Jean Kaddour","Qi Liu"],"pdf_url":"https://arxiv.org/pdf/2310.01119v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01089v1","updated":"2023-10-02T11:03:57Z","published":"2023-10-02T11:03:57Z","title":"GraphText: Graph Reasoning in Text Space","summary":" Large Language Models (LLMs) have gained the ability to assimilate human\nknowledge and facilitate natural language interactions with both humans and\nother LLMs. However, despite their impressive achievements, LLMs have not made\nsignificant advancements in the realm of graph machine learning. This\nlimitation arises because graphs encapsulate distinct relational data, making\nit challenging to transform them into natural language that LLMs understand. In\nthis paper, we bridge this gap with a novel framework, GraphText, that\ntranslates graphs into natural language. GraphText derives a graph-syntax tree\nfor each graph that encapsulates both the node attributes and inter-node\nrelationships. Traversal of the tree yields a graph text sequence, which is\nthen processed by an LLM to treat graph tasks as text generation tasks.\nNotably, GraphText offers multiple advantages. It introduces training-free\ngraph reasoning: even without training on graph data, GraphText with ChatGPT\ncan achieve on par with, or even surpassing, the performance of\nsupervised-trained graph neural networks through in-context learning (ICL).\nFurthermore, GraphText paves the way for interactive graph reasoning, allowing\nboth humans and LLMs to communicate with the model seamlessly using natural\nlanguage. These capabilities underscore the vast, yet-to-be-explored potential\nof LLMs in the domain of graph machine learning.\n","authors":["Jianan Zhao","Le Zhuo","Yikang Shen","Meng Qu","Kai Liu","Michael Bronstein","Zhaocheng Zhu","Jian Tang"],"pdf_url":"https://arxiv.org/pdf/2310.01089v1.pdf","comment":"Preprint. Work in progress"},{"id":"http://arxiv.org/abs/2310.01088v1","updated":"2023-10-02T11:03:20Z","published":"2023-10-02T11:03:20Z","title":"Towards human-like spoken dialogue generation between AI agents from\n written dialogue","summary":" The advent of large language models (LLMs) has made it possible to generate\nnatural written dialogues between two agents. However, generating human-like\nspoken dialogues from these written dialogues remains challenging. Spoken\ndialogues have several unique characteristics: they frequently include\nbackchannels and laughter, and the smoothness of turn-taking significantly\ninfluences the fluidity of conversation. This study proposes CHATS - CHatty\nAgents Text-to-Speech - a discrete token-based system designed to generate\nspoken dialogues based on written dialogues. Our system can generate speech for\nboth the speaker side and the listener side simultaneously, using only the\ntranscription from the speaker side, which eliminates the need for\ntranscriptions of backchannels or laughter. Moreover, CHATS facilitates natural\nturn-taking; it determines the appropriate duration of silence after each\nutterance in the absence of overlap, and it initiates the generation of\noverlapping speech based on the phoneme sequence of the next utterance in case\nof overlap. 
Experimental evaluations indicate that CHATS outperforms the\ntext-to-speech baseline, producing spoken dialogues that are more interactive\nand fluid while retaining clarity and intelligibility.\n","authors":["Kentaro Mitsui","Yukiya Hono","Kei Sawada"],"pdf_url":"https://arxiv.org/pdf/2310.01088v1.pdf","comment":"18 pages, 8 figures, 9 tables, audio samples:\n https://rinnakk.github.io/research/publications/CHATS/"},{"id":"http://arxiv.org/abs/2310.01074v1","updated":"2023-10-02T10:35:23Z","published":"2023-10-02T10:35:23Z","title":"Back to the Future: Towards Explainable Temporal Reasoning with Large\n Language Models","summary":" Temporal reasoning is a crucial NLP task, providing a nuanced understanding\nof time-sensitive contexts within textual data. Although recent advancements in\nLLMs have demonstrated their potential in temporal reasoning, the predominant\nfocus has been on tasks such as temporal expression and temporal relation\nextraction. These tasks are primarily designed for the extraction of direct and\npast temporal cues and to engage in simple reasoning processes. A significant\ngap remains when considering complex reasoning tasks such as event forecasting,\nwhich requires multi-step temporal reasoning on events and prediction on the\nfuture timestamp. Another notable limitation of existing methods is their\nincapability to provide an illustration of their reasoning process, hindering\nexplainability. In this paper, we introduce the first task of explainable\ntemporal reasoning, to predict an event's occurrence at a future timestamp\nbased on context which requires multiple reasoning over multiple events, and\nsubsequently provide a clear explanation for their prediction. Our task offers\na comprehensive evaluation of both the LLMs' complex temporal reasoning\nability, the future event prediction ability, and explainability-a critical\nattribute for AI applications. To support this task, we present the first\nmulti-source instruction-tuning dataset of explainable temporal reasoning\n(ExpTime) with 26k derived from the temporal knowledge graph datasets and their\ntemporal reasoning paths, using a novel knowledge-graph-instructed-generation\nstrategy. Based on the dataset, we propose the first open-source LLM series\nTimeLlaMA based on the foundation LlaMA2, with the ability of instruction\nfollowing for explainable temporal reasoning. We compare the performance of our\nmethod and a variety of LLMs, where our method achieves the state-of-the-art\nperformance of temporal prediction and explanation.\n","authors":["Chenhan Yuan","Qianqian Xie","Jimin Huang","Sophia Ananiadou"],"pdf_url":"https://arxiv.org/pdf/2310.01074v1.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2310.01061v1","updated":"2023-10-02T10:14:43Z","published":"2023-10-02T10:14:43Z","title":"Reasoning on Graphs: Faithful and Interpretable Large Language Model\n Reasoning","summary":" Large language models (LLMs) have demonstrated impressive reasoning abilities\nin complex tasks. However, they lack up-to-date knowledge and experience\nhallucinations during reasoning, which can lead to incorrect reasoning\nprocesses and diminish their performance and trustworthiness. Knowledge graphs\n(KGs), which capture vast amounts of facts in a structured format, offer a\nreliable source of knowledge for reasoning. Nevertheless, existing KG-based LLM\nreasoning methods only treat KGs as factual knowledge bases and overlook the\nimportance of their structural information for reasoning. 
In this paper, we\npropose a novel method called reasoning on graphs (RoG) that synergizes LLMs\nwith KGs to enable faithful and interpretable reasoning. Specifically, we\npresent a planning-retrieval-reasoning framework, where RoG first generates\nrelation paths grounded by KGs as faithful plans. These plans are then used to\nretrieve valid reasoning paths from the KGs for LLMs to conduct faithful\nreasoning. Furthermore, RoG not only distills knowledge from KGs to improve the\nreasoning ability of LLMs through training but also allows seamless integration\nwith any arbitrary LLMs during inference. Extensive experiments on two\nbenchmark KGQA datasets demonstrate that RoG achieves state-of-the-art\nperformance on KG reasoning tasks and generates faithful and interpretable\nreasoning results.\n","authors":["Linhao Luo","Yuan-Fang Li","Gholamreza Haffari","Shirui Pan"],"pdf_url":"https://arxiv.org/pdf/2310.01061v1.pdf","comment":"22 pages, 4 figures"},{"id":"http://arxiv.org/abs/2310.01045v1","updated":"2023-10-02T09:47:40Z","published":"2023-10-02T09:47:40Z","title":"Tool-Augmented Reward Modeling","summary":" Reward modeling (a.k.a., preference modeling) is instrumental for aligning\nlarge language models with human preferences, particularly within the context\nof reinforcement learning from human feedback (RLHF). While conventional reward\nmodels (RMs) have exhibited remarkable scalability, they oft struggle with\nfundamental functionality such as arithmetic computation, code execution, and\nfactual lookup. In this paper, we propose a tool-augmented preference modeling\napproach, named \\name, to address these limitations by empowering RMs with\naccess to external environments, including calculators and search engines. This\napproach not only fosters synergy between tool utilization and reward grading\nbut also enhances interpretive capacity and scoring reliability. Our study\ndelves into the integration of external tools into RMs, enabling them to\ninteract with diverse external sources and construct task-specific tool\nengagement and reasoning traces in an autoregressive manner. We validate our\napproach across a wide range of domains, incorporating seven distinct external\ntools. Our experimental results demonstrate a noteworthy overall improvement of\n17.7% across eight tasks in preference ranking. Furthermore, our approach\noutperforms Gopher 280B by 7.3% on TruthfulQA task in zero-shot evaluation. In\nhuman evaluations, RLHF trained with Themis attains an average win rate of 32%\nwhen compared to baselines across four distinct tasks. Additionally, we provide\na comprehensive collection of tool-related RM datasets, incorporating data from\nseven distinct tool APIs, totaling 15,000 instances. We anticipate that this\npublicly available dataset will facilitate and inspire further research\nadvancements in the field.\n","authors":["Lei Li","Yekun Chai","Shuohuan Wang","Yu Sun","Hao Tian","Ningyu Zhang","Hua Wu"],"pdf_url":"https://arxiv.org/pdf/2310.01045v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01041v1","updated":"2023-10-02T09:35:27Z","published":"2023-10-02T09:35:27Z","title":"Language Model Decoding as Direct Metrics Optimization","summary":" Despite the remarkable advances in language modeling, current mainstream\ndecoding methods still struggle to generate texts that align with human texts\nacross different aspects. 
In particular, sampling-based methods produce\nless-repetitive texts which are often disjunctive in discourse, while\nsearch-based methods maintain topic coherence at the cost of increased\nrepetition. Overall, these methods fall short in achieving holistic alignment\nacross a broad range of aspects. In this work, we frame decoding from a\nlanguage model as an optimization problem with the goal of strictly matching\nthe expected performance with human texts measured by multiple metrics of\ndesired aspects simultaneously. The resulting decoding distribution enjoys an\nanalytical solution that scales the input language model distribution via a\nsequence-level energy function defined by these metrics. And most importantly,\nwe prove that this induced distribution is guaranteed to improve the perplexity\non human texts, which suggests a better approximation to the underlying\ndistribution of human texts. To facilitate tractable sampling from this\nglobally normalized distribution, we adopt the Sampling-Importance-Resampling\ntechnique. Experiments on various domains and model scales demonstrate the\nsuperiority of our method in metrics alignment with human texts and human\nevaluation over strong baselines.\n","authors":["Haozhe Ji","Pei Ke","Hongning Wang","Minlie Huang"],"pdf_url":"https://arxiv.org/pdf/2310.01041v1.pdf","comment":"Preprint. 28 pages, 3 figures"},{"id":"http://arxiv.org/abs/2310.00996v1","updated":"2023-10-02T08:58:29Z","published":"2023-10-02T08:58:29Z","title":"ARN: A Comprehensive Framework and Dataset for Analogical Reasoning on\n Narratives","summary":" Analogical reasoning is one of the prime abilities of humans and is linked to\ncreativity and scientific discoveries. This ability has been studied\nextensively in natural language processing (NLP) as well as in cognitive\npsychology by proposing various benchmarks and evaluation setups. Yet, a\nsubstantial gap exists between evaluations of analogical reasoning in cognitive\npsychology and NLP. Our aim is to bridge this by computationally adapting\ntheories related to analogical reasoning from cognitive psychology in the\ncontext of narratives and developing an evaluation framework large in scale.\nMore concretely, we propose the task of matching narratives based on system\nmappings and release the Analogical Reasoning on Narratives (ARN) dataset. To\ncreate the dataset, we devise a framework inspired by cognitive psychology\ntheories about analogical reasoning to utilize narratives and their components\nto form mappings of different abstractness levels. These mappings are then\nleveraged to create pairs of analogies and disanalogies/distractors with more\nthan 1k triples of query narratives, analogies, and distractors. We cover four\ncategories of far/near analogies and far/near distractors that allow us to\nstudy analogical reasoning in models from distinct perspectives. In this study,\nwe evaluate different large language models (LLMs) on this task. Our results\ndemonstrate that LLMs struggle to recognize higher-order mappings when they are\nnot accompanied by lower-order mappings (far analogies) and show better\nperformance when all mappings are present simultaneously (near analogies). 
We\nobserve that in all the settings, the analogical reasoning abilities of LLMs\ncan be easily impaired by near distractors that form lower-order mappings with\nthe query narratives.\n","authors":["Zhivar Sourati","Filip Ilievski","Pia Sommerauer"],"pdf_url":"https://arxiv.org/pdf/2310.00996v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00970v1","updated":"2023-10-02T08:22:34Z","published":"2023-10-02T08:22:34Z","title":"EALM: Introducing Multidimensional Ethical Alignment in Conversational\n Information Retrieval","summary":" Artificial intelligence (AI) technologies should adhere to human norms to\nbetter serve our society and avoid disseminating harmful or misleading\ninformation, particularly in Conversational Information Retrieval (CIR).\nPrevious work, including approaches and datasets, has not always been\nsuccessful or sufficiently robust in taking human norms into consideration. To\nthis end, we introduce a workflow that integrates ethical alignment, with an\ninitial ethical judgment stage for efficient data screening. To address the\nneed for ethical judgment in CIR, we present the QA-ETHICS dataset, adapted\nfrom the ETHICS benchmark, which serves as an evaluation tool by unifying\nscenarios and label meanings. However, each scenario only considers one ethical\nconcept. Therefore, we introduce the MP-ETHICS dataset to evaluate a scenario\nunder multiple ethical concepts, such as justice and Deontology. In addition,\nwe suggest a new approach that achieves top performance in both binary and\nmulti-label ethical judgment tasks. Our research provides a practical method\nfor introducing ethical alignment into the CIR workflow. The data and code are\navailable at https://github.com/wanng-ide/ealm .\n","authors":["Yiyao Yu","Junjie Wang","Yuxiang Zhang","Lin Zhang","Yujiu Yang","Tetsuya Sakai"],"pdf_url":"https://arxiv.org/pdf/2310.00970v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00935v1","updated":"2023-10-02T06:57:45Z","published":"2023-10-02T06:57:45Z","title":"Resolving Knowledge Conflicts in Large Language Models","summary":" Large language models (LLMs) often encounter knowledge conflicts, scenarios\nwhere discrepancy arises between the internal parametric knowledge of LLMs and\nnon-parametric information provided in the prompt context. In this work we ask\nwhat are the desiderata for LLMs when a knowledge conflict arises and whether\nexisting LLMs fulfill them. We posit that LLMs should 1) identify knowledge\nconflicts, 2) pinpoint conflicting information segments, and 3) provide\ndistinct answers or viewpoints in conflicting scenarios. To this end, we\nintroduce KNOWLEDGE CONFLICT, an evaluation framework for simulating contextual\nknowledge conflicts and quantitatively evaluating to what extent LLMs achieve\nthese goals. KNOWLEDGE CONFLICT includes diverse and complex situations of\nknowledge conflict, knowledge from diverse entities and domains, two synthetic\nconflict creation methods, and settings with progressively increasing\ndifficulty to reflect realistic knowledge conflicts. Extensive experiments with\nthe KNOWLEDGE CONFLICT framework reveal that while LLMs perform well in\nidentifying the existence of knowledge conflicts, they struggle to determine\nthe specific conflicting knowledge and produce a response with distinct answers\namidst conflicting information. To address these challenges, we propose new\ninstruction-based approaches that augment LLMs to better achieve the three\ngoals. 
Further analysis shows that abilities to tackle knowledge conflicts are\ngreatly impacted by factors such as knowledge domain and prompt text, while\ngenerating robust responses to knowledge conflict scenarios remains an open\nresearch question.\n","authors":["Yike Wang","Shangbin Feng","Heng Wang","Weijia Shi","Vidhisha Balachandran","Tianxing He","Yulia Tsvetkov"],"pdf_url":"https://arxiv.org/pdf/2310.00935v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01452v1","updated":"2023-10-02T06:57:25Z","published":"2023-10-02T06:57:25Z","title":"Fooling the Textual Fooler via Randomizing Latent Representations","summary":" Despite outstanding performance in a variety of NLP tasks, recent studies\nhave revealed that NLP models are vulnerable to adversarial attacks that\nslightly perturb the input to cause the models to misbehave. Among these\nattacks, adversarial word-level perturbations are well-studied and effective\nattack strategies. Since these attacks work in black-box settings, they do not\nrequire access to the model architecture or model parameters and thus can be\ndetrimental to existing NLP applications. To perform an attack, the adversary\nqueries the victim model many times to determine the most important words in an\ninput text and to replace these words with their corresponding synonyms. In\nthis work, we propose a lightweight and attack-agnostic defense whose main goal\nis to perplex the process of generating an adversarial example in these\nquery-based black-box attacks; that is to fool the textual fooler. This\ndefense, named AdvFooler, works by randomizing the latent representation of the\ninput at inference time. Different from existing defenses, AdvFooler does not\nnecessitate additional computational overhead during training nor relies on\nassumptions about the potential adversarial perturbation set while having a\nnegligible impact on the model's accuracy. Our theoretical and empirical\nanalyses highlight the significance of robustness resulting from confusing the\nadversary via randomizing the latent space, as well as the impact of\nrandomization on clean accuracy. Finally, we empirically demonstrate near\nstate-of-the-art robustness of AdvFooler against representative adversarial\nword-level attacks on two benchmark datasets.\n","authors":["Duy C. Hoang","Quang H. Nguyen","Saurav Manchanda","MinLong Peng","Kok-Seng Wong","Khoa D. Doan"],"pdf_url":"https://arxiv.org/pdf/2310.01452v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00905v1","updated":"2023-10-02T05:23:34Z","published":"2023-10-02T05:23:34Z","title":"All Languages Matter: On the Multilingual Safety of Large Language\n Models","summary":" Safety lies at the core of developing and deploying large language models\n(LLMs). However, previous safety benchmarks only concern the safety in one\nlanguage, e.g. the majority language in the pretraining data such as English.\nIn this work, we build the first multilingual safety benchmark for LLMs,\nXSafety, in response to the global deployment of LLMs in practice. XSafety\ncovers 14 kinds of commonly used safety issues across 10 languages that span\nseveral language families. We utilize XSafety to empirically study the\nmultilingual safety for 4 widely-used LLMs, including both close-API and\nopen-source models. Experimental results show that all LLMs produce\nsignificantly more unsafe responses for non-English queries than English ones,\nindicating the necessity of developing safety alignment for non-English\nlanguages. 
In addition, we propose several simple and effective prompting\nmethods to improve the multilingual safety of ChatGPT by evoking safety\nknowledge and improving cross-lingual generalization of safety alignment. Our\nprompting method can significantly reduce the ratio of unsafe responses from\n19.1% to 9.7% for non-English queries. We release our data at\nhttps://github.com/Jarviswang94/Multilingual_safety_benchmark.\n","authors":["Wenxuan Wang","Zhaopeng Tu","Chang Chen","Youliang Yuan","Jen-tse Huang","Wenxiang Jiao","Michael R. Lyu"],"pdf_url":"https://arxiv.org/pdf/2310.00905v1.pdf","comment":"The first multilingual safety benchmark for large language models"},{"id":"http://arxiv.org/abs/2310.00901v1","updated":"2023-10-02T04:42:53Z","published":"2023-10-02T04:42:53Z","title":"TADIS: Steering Models for Deep-Thinking about Demonstration Examples","summary":" Instruction tuning has been demonstrated to significantly improve the\nzero-shot generalization capability to unseen tasks by an apparent margin. By\nincorporating additional context (e.g., task definition, examples) during the\nfine-tuning process, Large Language Models (LLMs) achieved much higher\nperformance than before. However, recent work reported that delusive task\nexamples can achieve almost the same performance as correct task examples,\nindicating the input-label correspondence is less important than previously\nthought. Intrigued by this counter-intuitive observation, we suspect models\nhave the same illusion of competence as humans. Therefore, we propose a novel\nmethod called TADIS that steers LLMs for \"Deep-Thinking'' about demonstration\nexamples instead of merely seeing. To alleviate the illusion of competence of\nmodels, we first ask the model to verify the correctness of shown examples.\nThen, we use the verification results as conditions to elicit a better answer\nfrom the model. Our experimental results show that TADIS consistently\noutperforms competitive baselines on in-domain and out-domain tasks (improving\n2.79 and 4.03 average ROUGE-L on out-domain and in-domain datasets,\nrespectively). Despite the presence of generated examples (not all of the\nthinking labels are accurate), TADIS can notably enhance performance in\nzero-shot and few-shot settings. This also suggests that our approach can be\nadopted on a large scale to improve the instruction following capabilities of\nmodels without any manual labor. Moreover, we construct three types of thinking\nlabels with different model sizes and find that small models learn from the\nformat of TADIS but larger models can be steered for \"Deep-Thinking''.\n","authors":["Tianci Xue","Ziqi Wang","Yixia Li","Yun Chen","Guanhua Chen"],"pdf_url":"https://arxiv.org/pdf/2310.00901v1.pdf","comment":"14 pages, 3 figures"},{"id":"http://arxiv.org/abs/2310.00900v1","updated":"2023-10-02T04:36:39Z","published":"2023-10-02T04:36:39Z","title":"uSee: Unified Speech Enhancement and Editing with Conditional Diffusion\n Models","summary":" Speech enhancement aims to improve speech signals in terms of\nquality and intelligibility, and speech editing refers to the process of\nediting the speech according to specific user needs. In this paper, we propose\na Unified Speech Enhancement and Editing (uSee) model with conditional\ndiffusion models to handle various tasks at the same time in a generative\nmanner. 
Specifically, by providing multiple types of conditions including\nself-supervised learning embeddings and proper text prompts to the score-based\ndiffusion model, we can enable controllable generation of the unified speech\nenhancement and editing model to perform corresponding actions on the source\nspeech. Our experiments show that our proposed uSee model can achieve superior\nperformance in both speech denoising and dereverberation compared to other\nrelated generative speech enhancement models, and can perform speech editing\ngiven desired environmental sound text description, signal-to-noise ratios\n(SNR), and room impulse responses (RIR). Demos of the generated speech are\navailable at https://muqiaoy.github.io/usee.\n","authors":["Muqiao Yang","Chunlei Zhang","Yong Xu","Zhongweiyang Xu","Heming Wang","Bhiksha Raj","Dong Yu"],"pdf_url":"https://arxiv.org/pdf/2310.00900v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00898v1","updated":"2023-10-02T04:29:40Z","published":"2023-10-02T04:29:40Z","title":"Enable Language Models to Implicitly Learn Self-Improvement From Data","summary":" Large Language Models (LLMs) have demonstrated remarkable capabilities in\nopen-ended text generation tasks. However, the inherent open-ended nature of\nthese tasks implies that there is always room for improvement in the quality of\nmodel responses. To address this challenge, various approaches have been\nproposed to enhance the performance of LLMs. There has been a growing focus on\nenabling LLMs to self-improve their response quality, thereby reducing the\nreliance on extensive human annotation efforts for collecting diverse and\nhigh-quality training data. Recently, prompting-based methods have been widely\nexplored among self-improvement methods owing to their effectiveness,\nefficiency, and convenience. However, those methods usually require explicitly\nand thoroughly written rubrics as inputs to LLMs. It is expensive and\nchallenging to manually derive and provide all necessary rubrics with a\nreal-world complex goal for improvement (e.g., being more helpful and less\nharmful). To this end, we propose an ImPlicit Self-ImprovemenT (PIT) framework\nthat implicitly learns the improvement goal from human preference data. PIT\nonly requires preference data that are used to train reward models without\nextra human efforts. Specifically, we reformulate the training objective of\nreinforcement learning from human feedback (RLHF) -- instead of maximizing\nresponse quality for a given input, we maximize the quality gap of the response\nconditioned on a reference response. In this way, PIT is implicitly trained\nwith the improvement goal of better aligning with human preferences.\nExperiments on two real-world datasets and one synthetic dataset show that our\nmethod significantly outperforms prompting-based methods.\n","authors":["Ziqi Wang","Le Hou","Tianjian Lu","Yuexin Wu","Yunxuan Li","Hongkun Yu","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2310.00898v1.pdf","comment":"28 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2310.00892v1","updated":"2023-10-02T04:17:35Z","published":"2023-10-02T04:17:35Z","title":"No Offense Taken: Eliciting Offensiveness from Language Models","summary":" This work was completed in May 2022.\n For safe and reliable deployment of language models in the real world,\ntesting needs to be robust. 
This robustness can be characterized by the\ndifficulty and diversity of the test cases we evaluate these models on.\nLimitations in human-in-the-loop test case generation has prompted an advent of\nautomated test case generation approaches. In particular, we focus on Red\nTeaming Language Models with Language Models by Perez et al.(2022). Our\ncontributions include developing a pipeline for automated test case generation\nvia red teaming that leverages publicly available smaller language models\n(LMs), experimenting with different target LMs and red classifiers, and\ngenerating a corpus of test cases that can help in eliciting offensive\nresponses from widely deployed LMs and identifying their failure modes.\n","authors":["Anugya Srivastava","Rahul Ahuja","Rohith Mukku"],"pdf_url":"https://arxiv.org/pdf/2310.00892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00867v1","updated":"2023-10-02T03:12:06Z","published":"2023-10-02T03:12:06Z","title":"(Dynamic) Prompting might be all you need to repair Compressed LLMs","summary":" Large language models (LLMs), while transformative for NLP, come with\nsignificant computational demands, underlining the need for efficient,\ntraining-free compression. Notably, the reliability of perplexity as a\nbenchmark for compressed model efficacy is in question, as our tests using\nLLaMA-7B and OPT-6.7b reveal a significant performance drop in several\nrealistic downstream tasks, underscoring the disparity between perplexity as a\nperformance indicator and real-world performance. Investigation into the\ntrade-off between resource-intensive post-compression re-training highlights\nthe prospect of prompt-driven recovery as a lightweight adaption tool. However,\nexisting studies, confined mainly to perplexity evaluations and simple tasks,\nfail to offer unequivocal confidence in the scalability and generalizability of\nprompting. We tackle this uncertainty in two key ways. First, we uncover the\nvulnerability of naive prompts in LLM compression as an over-reliance on a\nsingular prompt per input. In response, we propose inference-time dynamic\nprompting (IDP), a mechanism that autonomously chooses from a set of curated\nprompts based on the context of each individual input. Second, we delve into a\nscientific understanding of why ``prompting might be all you need post-LLM\ncompression\". Our findings suggest that compression doesn't irretrievably erase\nLLM model knowledge but displace it, necessitating a new inference path. IDP\neffectively redirects this path, enabling the model to tap into its inherent\nyet displaced knowledge and thereby recover performance. Empirical tests affirm\nthe value of IDP, demonstrating an average performance improvement of 1.24%\nacross nine varied tasks spanning multiple knowledge domains.\n","authors":["Duc N. M Hoang","Minsik Cho","Thomas Merth","Mohammad Rastegari","Zhangyang Wang"],"pdf_url":"https://arxiv.org/pdf/2310.00867v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00863v1","updated":"2023-10-02T02:53:29Z","published":"2023-10-02T02:53:29Z","title":"Melody-conditioned lyrics generation via fine-tuning language model and\n its evaluation with ChatGPT","summary":" We leverage character-level language models for syllable-level lyrics\ngeneration from symbolic melody. By fine-tuning a character-level pre-trained\nmodel, we integrate language knowledge into the beam search of a syllable-level\nTransformer generator. 
Using ChatGPT-based evaluations, we demonstrate enhanced\ncoherence and correctness in the generated lyrics.\n","authors":["Zhe Zhang","Karol Lasocki","Yi Yu","Atsuhiro Takasu"],"pdf_url":"https://arxiv.org/pdf/2310.00863v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00845v1","updated":"2023-10-02T01:42:28Z","published":"2023-10-02T01:42:28Z","title":"Application of frozen large-scale models to multimodal task-oriented\n dialogue","summary":" In this study, we use the existing Large Language Models ENnhanced to See\nFramework (LENS Framework) to test the feasibility of multimodal task-oriented\ndialogues. The LENS Framework has been proposed as a method to solve computer\nvision tasks without additional training and with fixed parameters of\npre-trained models. We used the Multimodal Dialogs (MMD) dataset, a multimodal\ntask-oriented dialogue benchmark dataset from the fashion field, and for the\nevaluation, we used the ChatGPT-based G-EVAL, which only accepts textual\nmodalities, with arrangements to handle multimodal data. Compared to\nTransformer-based models in previous studies, our method demonstrated an\nabsolute lift of 10.8% in fluency, 8.8% in usefulness, and 5.2% in relevance\nand coherence. The results show that using large-scale models with fixed\nparameters rather than using models trained on a dataset from scratch improves\nperformance in multimodal task-oriented dialogues. At the same time, we show\nthat Large Language Models (LLMs) are effective for multimodal task-oriented\ndialogues. This is expected to lead to efficient applications to existing\nsystems.\n","authors":["Tatsuki Kawamoto","Takuma Suzuki","Ko Miyama","Takumi Meguro","Tomohiro Takagi"],"pdf_url":"https://arxiv.org/pdf/2310.00845v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2308.07545v2","updated":"2023-10-02T17:50:11Z","published":"2023-08-15T03:22:40Z","title":"Vision-Language Dataset Distillation","summary":" Dataset distillation methods promise to reduce large-scale datasets down to\nsignificantly smaller sets of (potentially synthetic) training examples, which\npreserve sufficient information for training a new model from scratch. So far,\ndataset distillation methods have been developed for image classification.\nHowever, with the rise in capabilities of vision-language models (VLMs), and\nespecially given the scale of datasets necessary to train these models, the\ntime is ripe to expand dataset distillation methods beyond image\nclassification. In this work, we take the first steps towards this goal by\nexpanding the idea of trajectory matching to create a distillation method for\nvision-language datasets. A key challenge is that vision-language datasets do\nnot have a set of discrete classes. To overcome this, our proposed\nvision-language dataset distillation method jointly distills the image-text\npairs in a contrastive formulation. Since there are no existing baselines, we\ncompare our approach to three coreset selection methods (strategic subsampling\nof the training dataset), which we adapt to the vision-language setting. 
We\ndemonstrate significant improvements on the challenging Flickr30K and COCO\nretrieval benchmarks: for example, on Flickr30K, the best coreset selection\nmethod selecting 1000 image-text pairs for training achieves only 5.6%\nimage-to-text retrieval accuracy (i.e., recall@1); in contrast, our dataset\ndistillation approach almost doubles that to 9.9% with just 100 (an order of\nmagnitude fewer) training pairs.\n","authors":["Xindi Wu","Byron Zhang","Zhiwei Deng","Olga Russakovsky"],"pdf_url":"https://arxiv.org/pdf/2308.07545v2.pdf","comment":"27 pages, 11 figures"},{"id":"http://arxiv.org/abs/2304.10592v2","updated":"2023-10-02T16:38:35Z","published":"2023-04-20T18:25:35Z","title":"MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large\n Language Models","summary":" The recent GPT-4 has demonstrated extraordinary multi-modal abilities, such\nas directly generating websites from handwritten text and identifying humorous\nelements within images. These features are rarely observed in previous\nvision-language models. However, the technical details behind GPT-4 continue to\nremain undisclosed. We believe that the enhanced multi-modal generation\ncapabilities of GPT-4 stem from the utilization of sophisticated large language\nmodels (LLM). To examine this phenomenon, we present MiniGPT-4, which aligns a\nfrozen visual encoder with a frozen advanced LLM, Vicuna, using one projection\nlayer. Our work, for the first time, uncovers that properly aligning the visual\nfeatures with an advanced large language model can possess numerous advanced\nmulti-modal abilities demonstrated by GPT-4, such as detailed image description\ngeneration and website creation from hand-drawn drafts. Furthermore, we also\nobserve other emerging capabilities in MiniGPT-4, including writing stories and\npoems inspired by given images, teaching users how to cook based on food\nphotos, and so on. In our experiment, we found that the model trained on short\nimage caption pairs could produce unnatural language outputs (e.g., repetition\nand fragmentation). To address this problem, we curate a detailed image\ndescription dataset in the second stage to finetune the model, which\nconsequently improves the model's generation reliability and overall usability.\nOur code, pre-trained model, and collected dataset are available at\nhttps://minigpt-4.github.io/.\n","authors":["Deyao Zhu","Jun Chen","Xiaoqian Shen","Xiang Li","Mohamed Elhoseiny"],"pdf_url":"https://arxiv.org/pdf/2304.10592v2.pdf","comment":"Project Website: https://minigpt-4.github.io/; Code, Pretrained\n Model, and Dataset: https://github.com/Vision-CAIR/MiniGPT-4; Deyao Zhu and\n Jun Chen contributed equally to this work"},{"id":"http://arxiv.org/abs/2305.17102v2","updated":"2023-10-02T16:23:03Z","published":"2023-05-26T17:15:22Z","title":"GeoVLN: Learning Geometry-Enhanced Visual Representation with Slot\n Attention for Vision-and-Language Navigation","summary":" Most existing works solving Room-to-Room VLN problem only utilize RGB images\nand do not consider local context around candidate views, which lack sufficient\nvisual cues about surrounding environment. Moreover, natural language contains\ncomplex semantic information thus its correlations with visual inputs are hard\nto model merely with cross attention. In this paper, we propose GeoVLN, which\nlearns Geometry-enhanced visual representation based on slot attention for\nrobust Visual-and-Language Navigation. 
The RGB images are compensated with the\ncorresponding depth maps and normal maps predicted by Omnidata as visual\ninputs. Technically, we introduce a two-stage module that combines local slot\nattention and the CLIP model to produce geometry-enhanced representation from such\ninput. We employ V&L BERT to learn a cross-modal representation that\nincorporates both language and vision information. Additionally, a novel\nmultiway attention module is designed, encouraging different phrases of input\ninstruction to exploit the most related features from visual input. Extensive\nexperiments demonstrate the effectiveness of our newly designed modules and\nshow the compelling performance of the proposed method.\n","authors":["Jingyang Huo","Qiang Sun","Boyan Jiang","Haitao Lin","Yanwei Fu"],"pdf_url":"https://arxiv.org/pdf/2305.17102v2.pdf","comment":"Accepted by CVPR 2023"},{"id":"http://arxiv.org/abs/2307.10829v4","updated":"2023-10-02T15:39:38Z","published":"2023-07-10T12:18:18Z","title":"Exact Diffusion Inversion via Bi-directional Integration Approximation","summary":" Recently, various methods have been proposed to address the inconsistency\nissue of DDIM inversion to enable image editing, such as EDICT [36] and\nNull-text inversion [22]. However, the above methods introduce considerable\ncomputational overhead. In this paper, we propose a new technique, named\n\\emph{bi-directional integration approximation} (BDIA), to perform exact\ndiffusion inversion with negligible computational overhead. Suppose we would like\nto estimate the next diffusion state $\\boldsymbol{z}_{i-1}$ at timestep $t_i$\nwith the historical information $(i,\\boldsymbol{z}_i)$ and\n$(i+1,\\boldsymbol{z}_{i+1})$. We first obtain the estimated Gaussian noise\n$\\hat{\\boldsymbol{\\epsilon}}(\\boldsymbol{z}_i,i)$, and then apply the DDIM\nupdate procedure twice for approximating the ODE integration over the next\ntime-slot $[t_i, t_{i-1}]$ in the forward manner and the previous time-slot\n$[t_i, t_{i+1}]$ in the backward manner. The DDIM step for the previous\ntime-slot is used to refine the integration approximation made earlier when\ncomputing $\\boldsymbol{z}_i$. A nice property of BDIA-DDIM is that the update\nexpression for $\\boldsymbol{z}_{i-1}$ is a linear combination of\n$(\\boldsymbol{z}_{i+1}, \\boldsymbol{z}_i,\n\\hat{\\boldsymbol{\\epsilon}}(\\boldsymbol{z}_i,i))$. This allows for exact\nbackward computation of $\\boldsymbol{z}_{i+1}$ given $(\\boldsymbol{z}_i,\n\\boldsymbol{z}_{i-1})$, thus leading to exact diffusion inversion. It is\ndemonstrated with experiments that (round-trip) BDIA-DDIM is particularly\neffective for image editing. Our experiments further show that BDIA-DDIM\nproduces markedly better image sampling qualities than DDIM for text-to-image\ngeneration.\n BDIA can also be applied to improve the performance of other ODE solvers in\naddition to DDIM. In our work, it is found that applying BDIA to the EDM\nsampling procedure produces new SOTA performance over CIFAR10.\n","authors":["Guoqiang Zhang","J. P. Lewis","W. Bastiaan Kleijn"],"pdf_url":"https://arxiv.org/pdf/2307.10829v4.pdf","comment":"arXiv admin note: text overlap with arXiv:2304.11328. 
Our code is\n available at https://github.com/guoqiang-zhang-x/BDIA"},{"id":"http://arxiv.org/abs/2303.15564v2","updated":"2023-10-02T15:33:54Z","published":"2023-03-27T19:23:33Z","title":"Mask and Restore: Blind Backdoor Defense at Test Time with Masked\n Autoencoder","summary":" Deep neural networks are vulnerable to backdoor attacks, where an adversary\nmaliciously manipulates the model behavior through overlaying images with\nspecial triggers. Existing backdoor defense methods often require accessing a\nfew validation data and model parameters, which are impractical in many\nreal-world applications, e.g., when the model is provided as a cloud service.\nIn this paper, we address the practical task of blind backdoor defense at test\ntime, in particular for black-box models. The true label of every test image\nneeds to be recovered on the fly from a suspicious model regardless of image\nbenignity. We focus on test-time image purification methods that incapacitate\npossible triggers while keeping semantic contents intact. Due to diverse\ntrigger patterns and sizes, the heuristic trigger search in image space can be\nunscalable. We circumvent such barrier by leveraging the strong reconstruction\npower of generative models, and propose a framework of Blind Defense with\nMasked AutoEncoder (BDMAE). It detects possible triggers in the token space\nusing image structural similarity and label consistency between the test image\nand MAE restorations. The detection results are then refined by considering\ntrigger topology. Finally, we fuse MAE restorations adaptively into a purified\nimage for making prediction. Our approach is blind to the model architectures,\ntrigger patterns and image benignity. Extensive experiments under different\nbackdoor settings validate its effectiveness and generalizability. Code is\navailable at https://github.com/tsun/BDMAE.\n","authors":["Tao Sun","Lu Pang","Chao Chen","Haibin Ling"],"pdf_url":"https://arxiv.org/pdf/2303.15564v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11342v3","updated":"2023-10-02T14:57:11Z","published":"2023-07-21T04:15:02Z","title":"Tuning Pre-trained Model via Moment Probing","summary":" Recently, efficient fine-tuning of large-scale pre-trained models has\nattracted increasing research interests, where linear probing (LP) as a\nfundamental module is involved in exploiting the final representations for\ntask-dependent classification. However, most of the existing methods focus on\nhow to effectively introduce a few of learnable parameters, and little work\npays attention to the commonly used LP module. In this paper, we propose a\nnovel Moment Probing (MP) method to further explore the potential of LP.\nDistinguished from LP which builds a linear classification head based on the\nmean of final features (e.g., word tokens for ViT) or classification tokens,\nour MP performs a linear classifier on feature distribution, which provides the\nstronger representation ability by exploiting richer statistical information\ninherent in features. Specifically, we represent feature distribution by its\ncharacteristic function, which is efficiently approximated by using first- and\nsecond-order moments of features. Furthermore, we propose a multi-head\nconvolutional cross-covariance (MHC$^3$) to compute second-order moments in an\nefficient and effective manner. By considering that MP could affect feature\nlearning, we introduce a partially shared module to learn two recalibrating\nparameters (PSRP) for backbones based on MP, namely MP$_{+}$. 
Extensive\nexperiments on ten benchmarks using various models show that our MP\nsignificantly outperforms LP and is competitive with counterparts at lower\ntraining cost, while our MP$_{+}$ achieves state-of-the-art performance.\n","authors":["Mingze Gao","Qilong Wang","Zhenyi Lin","Pengfei Zhu","Qinghua Hu","Jingbo Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.11342v3.pdf","comment":"Accepted to ICCV 2023; Project Page:\n https://github.com/mingzeG/Moment-Probing"},{"id":"http://arxiv.org/abs/2304.03897v2","updated":"2023-10-02T14:48:45Z","published":"2023-04-08T03:14:19Z","title":"Factify 2: A Multimodal Fake News and Satire News Dataset","summary":" The internet gives the world an open platform to express their views and\nshare their stories. While this is very valuable, it makes fake news one of our\nsociety's most pressing problems. The manual fact-checking process is\ntime-consuming, which makes it challenging to disprove misleading assertions before\nthey cause significant harm. This is the driving interest in automatic fact or\nclaim verification. Some of the existing datasets aim to support the development of\nautomated fact-checking techniques; however, most of them are text-based.\nMulti-modal fact verification has received relatively scant attention. In this\npaper, we provide a multi-modal fact-checking dataset called FACTIFY 2,\nimproving Factify 1 by using new data sources and adding satire articles.\nFactify 2 has 50,000 new data instances. Similar to FACTIFY 1.0, we have three\nbroad categories - support, no-evidence, and refute, with sub-categories based\non the entailment of visual and textual data. We also provide a BERT and Vision\nTransformer based baseline, which achieves a 65% F1 score on the test set. The\nbaseline codes and the dataset will be made available at\nhttps://github.com/surya1701/Factify-2.0.\n","authors":["S Suryavardan","Shreyash Mishra","Parth Patwa","Megha Chakraborty","Anku Rani","Aishwarya Reganti","Aman Chadha","Amitava Das","Amit Sheth","Manoj Chinnakotla","Asif Ekbal","Srijan Kumar"],"pdf_url":"https://arxiv.org/pdf/2304.03897v2.pdf","comment":"Defactify2 @AAAI2023"},{"id":"http://arxiv.org/abs/2309.07915v2","updated":"2023-10-02T14:46:01Z","published":"2023-09-14T17:59:17Z","title":"MMICL: Empowering Vision-language Model with Multi-Modal In-Context\n Learning","summary":" Since the resurgence of deep learning, vision-language models (VLMs) enhanced\nby large language models (LLMs) have grown exponentially in popularity.\nHowever, while LLMs can utilize extensive background knowledge and task\ninformation with in-context learning, most VLMs still struggle with\nunderstanding complex multi-modal prompts with multiple images, making VLMs\nless effective in downstream vision-language tasks. In this paper, we address\nthe limitation above by 1) introducing MMICL, a new approach to allow the VLM\nto deal with multi-modal inputs efficiently; 2) proposing a novel context\nscheme to augment the in-context learning ability of the VLM; 3) constructing\nthe Multi-modal In-Context Learning (MIC) dataset, designed to enhance the\nVLM's ability to understand complex multi-modal prompts. Our experiments\nconfirm that MMICL achieves new state-of-the-art zero-shot performance on a\nwide range of general vision-language tasks, especially for complex benchmarks,\nincluding MME and MMBench. Our analysis demonstrates that MMICL effectively\ntackles the challenge of complex multi-modal prompt understanding and exhibits\nimpressive ICL ability. 
Furthermore, we observe that MMICL successfully\nalleviates language bias in VLMs, a common issue for VLMs that often leads to\nhallucination when faced with extensive textual context.\n","authors":["Haozhe Zhao","Zefan Cai","Shuzheng Si","Xiaojian Ma","Kaikai An","Liang Chen","Zixuan Liu","Sheng Wang","Wenjuan Han","Baobao Chang"],"pdf_url":"https://arxiv.org/pdf/2309.07915v2.pdf","comment":"Code, dataset, checkpoints, and demos are available at\n https://github.com/PKUnlp-icler/MIC"},{"id":"http://arxiv.org/abs/2303.13227v2","updated":"2023-10-02T13:36:36Z","published":"2023-03-23T12:48:47Z","title":"Confidence-Aware and Self-Supervised Image Anomaly Localisation","summary":" Universal anomaly detection still remains a challenging problem in machine\nlearning and medical image analysis. It is possible to learn an expected\ndistribution from a single class of normative samples, e.g., through epistemic\nuncertainty estimates, auto-encoding models, or from synthetic anomalies in a\nself-supervised way. The performance of self-supervised anomaly detection\napproaches is still inferior compared to methods that use examples from known\nunknown classes to shape the decision boundary. However, outlier exposure\nmethods often do not identify unknown unknowns. Here we discuss an improved\nself-supervised single-class training strategy that supports the approximation\nof probabilistic inference with loosen feature locality constraints. We show\nthat up-scaling of gradients with histogram-equalised images is beneficial for\nrecently proposed self-supervision tasks. Our method is integrated into several\nout-of-distribution (OOD) detection models and we show evidence that our method\noutperforms the state-of-the-art on various benchmark datasets.\n","authors":["Johanna P. Müller","Matthew Baugh","Jeremy Tan","Mischa Dombrowski","Bernhard Kainz"],"pdf_url":"https://arxiv.org/pdf/2303.13227v2.pdf","comment":"Accepted for MICCAI UNSURE Workshop 2023 (Spotlight)"},{"id":"http://arxiv.org/abs/2304.12317v2","updated":"2023-10-02T13:07:37Z","published":"2023-04-24T17:59:52Z","title":"Total-Recon: Deformable Scene Reconstruction for Embodied View Synthesis","summary":" We explore the task of embodied view synthesis from monocular videos of\ndeformable scenes. Given a minute-long RGBD video of people interacting with\ntheir pets, we render the scene from novel camera trajectories derived from the\nin-scene motion of actors: (1) egocentric cameras that simulate the point of\nview of a target actor and (2) 3rd-person cameras that follow the actor.\nBuilding such a system requires reconstructing the root-body and articulated\nmotion of every actor, as well as a scene representation that supports\nfree-viewpoint synthesis. Longer videos are more likely to capture the scene\nfrom diverse viewpoints (which helps reconstruction) but are also more likely\nto contain larger motions (which complicates reconstruction). To address these\nchallenges, we present Total-Recon, the first method to photorealistically\nreconstruct deformable scenes from long monocular RGBD videos. Crucially, to\nscale to long videos, our method hierarchically decomposes the scene into the\nbackground and objects, whose motion is decomposed into carefully initialized\nroot-body motion and local articulations. To quantify such \"in-the-wild\"\nreconstruction and view synthesis, we collect ground-truth data from a\nspecialized stereo RGBD capture rig for 11 challenging videos, significantly\noutperforming prior methods. 
Our code, model, and data can be found at\nhttps://andrewsonga.github.io/totalrecon .\n","authors":["Chonghyuk Song","Gengshan Yang","Kangle Deng","Jun-Yan Zhu","Deva Ramanan"],"pdf_url":"https://arxiv.org/pdf/2304.12317v2.pdf","comment":"ICCV 2023 camera-ready version. Project page with code, models, and\n data: https://andrewsonga.github.io/totalrecon"},{"id":"http://arxiv.org/abs/2307.10922v2","updated":"2023-10-02T12:57:16Z","published":"2023-07-20T14:47:50Z","title":"Language-based Action Concept Spaces Improve Video Self-Supervised\n Learning","summary":" Recent contrastive language image pre-training has led to learning highly\ntransferable and robust image representations. However, adapting these models\nto video domains with minimal supervision remains an open problem. We explore a\nsimple step in that direction, using language tied self-supervised learning to\nadapt an image CLIP model to the video domain. A backbone modified for temporal\nmodeling is trained under self-distillation settings with train objectives\noperating in an action concept space. Feature vectors of various action\nconcepts extracted from a language encoder using relevant textual prompts\nconstruct this space. We introduce two train objectives, concept distillation\nand concept alignment, that retain generality of original representations while\nenforcing relations between actions and their attributes. Our approach improves\nzero-shot and linear probing performance on three action recognition\nbenchmarks.\n","authors":["Kanchana Ranasinghe","Michael Ryoo"],"pdf_url":"https://arxiv.org/pdf/2307.10922v2.pdf","comment":"Presented at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2304.10864v2","updated":"2023-10-02T12:40:34Z","published":"2023-04-21T10:23:34Z","title":"FreMIM: Fourier Transform Meets Masked Image Modeling for Medical Image\n Segmentation","summary":" The research community has witnessed the powerful potential of\nself-supervised Masked Image Modeling (MIM), which enables the models capable\nof learning visual representation from unlabeled data.In this paper, to\nincorporate both the crucial global structural information and local details\nfor dense prediction tasks, we alter the perspective to the frequency domain\nand present a new MIM-based framework named FreMIM for self-supervised\npre-training to better accomplish medical image segmentation task. Based on the\nobservations that the detailed structural information mainly lies in the\nhigh-frequency components and the high-level semantics are abundant in the\nlow-frequency counterparts, we further incorporate multi-stage supervision to\nguide the representation learning during the pre-training phase. Extensive\nexperiments on three benchmark datasets show the superior advantage of our\nFreMIM over previous state-of-the-art MIM methods. Compared with various\nbaselines trained from scratch, our FreMIM could consistently bring\nconsiderable improvements to model performance. 
The code will be made publicly\navailable.\n","authors":["Wenxuan Wang","Jing Wang","Chen Chen","Jianbo Jiao","Lichao Sun","Yuanxiu Cai","Shanshan Song","Jiangyun Li"],"pdf_url":"https://arxiv.org/pdf/2304.10864v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06930v2","updated":"2023-10-02T11:58:10Z","published":"2023-07-13T17:51:58Z","title":"mBLIP: Efficient Bootstrapping of Multilingual Vision-LLMs","summary":" Modular vision-language models (Vision-LLMs) align pretrained image encoders\nwith frozen large language models (LLMs), representing a computationally much\nmore efficient alternative to end-to-end training of large vision-language\nmodels from scratch, which is prohibitively expensive for most researchers and\npractitioners. Vision-LLMs instead post-hoc condition LLMs to `understand' the\noutput of an image encoder. With the abundance of readily available\nhigh-quality English image-text data as well as monolingual English LLMs, the\nresearch focus has been on English-only Vision-LLMs. Multilingual\nvision-language models are still predominantly obtained via expensive\nend-to-end pretraining, resulting in comparatively smaller models, trained on\nlimited multilingual image data supplemented with text-only multilingual\ncorpora. In this work, we present mBLIP, the first multilingual Vision-LLM,\nwhich we obtain in a computationally efficient manner -- on consumer hardware\nand using only a few million training examples -- by leveraging a pretrained\nmultilingual LLM. To this end, we \\textit{re-align} an image encoder previously\ntuned to an English LLM to a new, multilingual LLM -- for this, we leverage\nmultilingual data from a mix of vision-and-language tasks, which we obtain by\nmachine-translating high-quality English data to 95 languages. On the IGLUE\nbenchmark, mBLIP yields results competitive with state-of-the-art models.\nMoreover, in image captioning on XM3600, mBLIP (zero-shot) even outperforms\nPaLI-X (a model with 55B parameters). Compared to these very large multilingual\nvision-language models trained from scratch, we obtain mBLIP by training orders\nof magnitude fewer parameters on magnitudes less data. We release our model and\ncode at \\url{https://github.com/gregor-ge/mBLIP}.\n","authors":["Gregor Geigle","Abhay Jain","Radu Timofte","Goran Glavaš"],"pdf_url":"https://arxiv.org/pdf/2307.06930v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.05673v2","updated":"2023-10-02T11:57:04Z","published":"2023-04-12T07:49:21Z","title":"Precise localization of corneal reflections in eye images using deep\n learning trained on synthetic data","summary":" We present a deep learning method for accurately localizing the center of a\nsingle corneal reflection (CR) in an eye image. Unlike previous approaches, we\nuse a convolutional neural network (CNN) that was trained solely using\nsimulated data. Using only simulated data has the benefit of completely\nsidestepping the time-consuming process of manual annotation that is required\nfor supervised training on real eye images. To systematically evaluate the\naccuracy of our method, we first tested it on images with simulated CRs placed\non different backgrounds and embedded in varying levels of noise. Second, we\ntested the method on high-quality videos captured from real eyes. 
Our method\noutperformed state-of-the-art algorithmic methods on real eye images with a 35%\nreduction in terms of spatial precision, and performed on par with\nstate-of-the-art on simulated images in terms of spatial accuracy.We conclude\nthat our method provides a precise method for CR center localization and\nprovides a solution to the data availability problem which is one of the\nimportant common roadblocks in the development of deep learning models for gaze\nestimation. Due to the superior CR center localization and ease of application,\nour method has the potential to improve the accuracy and precision of CR-based\neye trackers\n","authors":["Sean Anthony Byrne","Marcus Nyström","Virmarie Maquiling","Enkelejda Kasneci","Diederick C. Niehorster"],"pdf_url":"https://arxiv.org/pdf/2304.05673v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12706v4","updated":"2023-10-02T11:04:23Z","published":"2023-03-16T09:14:48Z","title":"Multi-modal Variational Autoencoders for normative modelling across\n multiple imaging modalities","summary":" One of the challenges of studying common neurological disorders is disease\nheterogeneity including differences in causes, neuroimaging characteristics,\ncomorbidities, or genetic variation. Normative modelling has become a popular\nmethod for studying such cohorts where the 'normal' behaviour of a\nphysiological system is modelled and can be used at subject level to detect\ndeviations relating to disease pathology. For many heterogeneous diseases, we\nexpect to observe abnormalities across a range of neuroimaging and biological\nvariables. However, thus far, normative models have largely been developed for\nstudying a single imaging modality. We aim to develop a multi-modal normative\nmodelling framework where abnormality is aggregated across variables of\nmultiple modalities and is better able to detect deviations than uni-modal\nbaselines. We propose two multi-modal VAE normative models to detect subject\nlevel deviations across T1 and DTI data. Our proposed models were better able\nto detect diseased individuals, capture disease severity, and correlate with\npatient cognition than baseline approaches. We also propose a multivariate\nlatent deviation metric, measuring deviations from the joint latent space,\nwhich outperformed feature-based metrics.\n","authors":["Ana Lawry Aguila","James Chapman","Andre Altmann"],"pdf_url":"https://arxiv.org/pdf/2303.12706v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16512v2","updated":"2023-10-02T10:42:28Z","published":"2023-08-31T07:49:06Z","title":"MVDream: Multi-view Diffusion for 3D Generation","summary":" We introduce MVDream, a multi-view diffusion model that is able to generate\nconsistent multi-view images from a given text prompt. Learning from both 2D\nand 3D data, a multi-view diffusion model can achieve the generalizability of\n2D diffusion models and the consistency of 3D renderings. We demonstrate that\nsuch a multi-view prior can serve as a generalizable 3D prior that is agnostic\nto 3D representations. It can be applied to 3D generation via Score\nDistillation Sampling, significantly enhancing the consistency and stability of\nexisting 2D-lifting methods. 
It can also learn new concepts from a few 2D\nexamples, akin to DreamBooth, but for 3D generation.\n","authors":["Yichun Shi","Peng Wang","Jianglong Ye","Mai Long","Kejie Li","Xiao Yang"],"pdf_url":"https://arxiv.org/pdf/2308.16512v2.pdf","comment":"Our project page is https://MV-Dream.github.io"},{"id":"http://arxiv.org/abs/2309.15523v3","updated":"2023-10-02T10:39:07Z","published":"2023-09-27T09:41:36Z","title":"Improving Facade Parsing with Vision Transformers and Line Integration","summary":" Facade parsing stands as a pivotal computer vision task with far-reaching\napplications in areas like architecture, urban planning, and energy efficiency.\nDespite the recent success of deep learning-based methods in yielding\nimpressive results on certain open-source datasets, their viability for\nreal-world applications remains uncertain. Real-world scenarios are\nconsiderably more intricate, demanding greater computational efficiency.\nExisting datasets often fall short in representing these settings, and previous\nmethods frequently rely on extra models to enhance accuracy, which requires\nmuch computation cost. In this paper, we introduce Comprehensive Facade Parsing\n(CFP), a dataset meticulously designed to encompass the intricacies of\nreal-world facade parsing tasks. Comprising a total of 602 high-resolution\nstreet-view images, this dataset captures a diverse array of challenging\nscenarios, including sloping angles and densely clustered buildings, with\npainstakingly curated annotations for each image. We introduce a new pipeline\nknown as Revision-based Transformer Facade Parsing (RTFP). This marks the\npioneering utilization of Vision Transformers (ViT) in facade parsing, and our\nexperimental results definitively substantiate its merit. We also design Line\nAcquisition, Filtering, and Revision (LAFR), an efficient yet accurate revision\nalgorithm that can improve the segment result solely from simple line detection\nusing prior knowledge of the facade. In ECP 2011, RueMonge 2014, and our CFP,\nwe evaluate the superiority of our method. The dataset and code are available\nat https://github.com/wbw520/RTFP.\n","authors":["Bowen Wang","Jiaxing Zhang","Ran Zhang","Yunqin Li","Liangzhi Li","Yuta Nakashima"],"pdf_url":"https://arxiv.org/pdf/2309.15523v3.pdf","comment":"13 pages, 7 figures, 9 tables"},{"id":"http://arxiv.org/abs/2309.17036v2","updated":"2023-10-02T09:50:47Z","published":"2023-09-29T07:50:09Z","title":"UniQuadric: A SLAM Backend for Unknown Rigid Object 3D Tracking and\n Light-Weight Modeling","summary":" Tracking and modeling unknown rigid objects in the environment play a crucial\nrole in autonomous unmanned systems and virtual-real interactive applications.\nHowever, many existing Simultaneous Localization, Mapping and Moving Object\nTracking (SLAMMOT) methods focus solely on estimating specific object poses and\nlack estimation of object scales and are unable to effectively track unknown\nobjects. In this paper, we propose a novel SLAM backend that unifies ego-motion\ntracking, rigid object motion tracking, and modeling within a joint\noptimization framework. In the perception part, we designed a pixel-level\nasynchronous object tracker (AOT) based on the Segment Anything Model (SAM) and\nDeAOT, enabling the tracker to effectively track target unknown objects guided\nby various predefined tasks and prompts. In the modeling part, we present a\nnovel object-centric quadric parameterization to unify both static and dynamic\nobject initialization and optimization. 
Subsequently, in the part of object\nstate estimation, we propose a tightly coupled optimization model for object\npose and scale estimation, incorporating hybrids constraints into a novel dual\nsliding window optimization framework for joint estimation. To our knowledge,\nwe are the first to tightly couple object pose tracking with light-weight\nmodeling of dynamic and static objects using quadric. We conduct qualitative\nand quantitative experiments on simulation datasets and real-world datasets,\ndemonstrating the state-of-the-art robustness and accuracy in motion estimation\nand modeling. Our system showcases the potential application of object\nperception in complex dynamic scenes.\n","authors":["Linghao Yang","Yanmin Wu","Yu Deng","Rui Tian","Xinggang Hu","Tiefeng Ma"],"pdf_url":"https://arxiv.org/pdf/2309.17036v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09874v2","updated":"2023-10-02T09:40:54Z","published":"2023-03-17T10:38:27Z","title":"Disentangling the Link Between Image Statistics and Human Perception","summary":" In the 1950s, Barlow and Attneave hypothesised a link between biological\nvision and information maximisation. Following Shannon, information was defined\nusing the probability of natural images. A number of physiological and\npsychophysical phenomena have been derived ever since from principles like\ninfo-max, efficient coding, or optimal denoising. However, it remains unclear\nhow this link is expressed in mathematical terms from image probability. First,\nclassical derivations were subjected to strong assumptions on the probability\nmodels and on the behaviour of the sensors. Moreover, the direct evaluation of\nthe hypothesis was limited by the inability of the classical image models to\ndeliver accurate estimates of the probability. In this work we directly\nevaluate image probabilities using an advanced generative model for natural\nimages, and we analyse how probability-related factors can be combined to\npredict human perception via sensitivity of state-of-the-art subjective image\nquality metrics. We use information theory and regression analysis to find a\ncombination of just two probability-related factors that achieves 0.8\ncorrelation with subjective metrics. This probability-based sensitivity is\npsychophysically validated by reproducing the basic trends of the Contrast\nSensitivity Function, its suprathreshold variation, and trends of the Weber-law\nand masking.\n","authors":["Alexander Hepburn","Valero Laparra","Raúl Santos-Rodriguez","Jesús Malo"],"pdf_url":"https://arxiv.org/pdf/2303.09874v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17343v2","updated":"2023-10-02T08:34:54Z","published":"2023-05-27T02:57:39Z","title":"Modality-Independent Teachers Meet Weakly-Supervised Audio-Visual Event\n Parser","summary":" Audio-visual learning has been a major pillar of multi-modal machine\nlearning, where the community mostly focused on its modality-aligned setting,\ni.e., the audio and visual modality are both assumed to signal the prediction\ntarget. With the Look, Listen, and Parse dataset (LLP), we investigate the\nunder-explored unaligned setting, where the goal is to recognize audio and\nvisual events in a video with only weak labels observed. Such weak video-level\nlabels only tell what events happen without knowing the modality they are\nperceived (audio, visual, or both). To enhance learning in this challenging\nsetting, we incorporate large-scale contrastively pre-trained models as the\nmodality teachers. 
A simple, effective, and generic method, termed Visual-Audio\nLabel Elaboration (VALOR), is innovated to harvest modality labels for the\ntraining events. Empirical studies show that the harvested labels significantly\nimprove an attentional baseline by 8.0 in average F-score (Type@AV).\nSurprisingly, we found that modality-independent teachers outperform their\nmodality-fused counterparts since they are noise-proof from the other\npotentially unaligned modality. Moreover, our best model achieves the new\nstate-of-the-art on all metrics of LLP by a substantial margin (+5.4 F-score\nfor Type@AV). VALOR is further generalized to Audio-Visual Event Localization\nand achieves the new state-of-the-art as well. Code is available at:\nhttps://github.com/Franklin905/VALOR.\n","authors":["Yung-Hsuan Lai","Yen-Chun Chen","Yu-Chiang Frank Wang"],"pdf_url":"https://arxiv.org/pdf/2305.17343v2.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2304.00186v5","updated":"2023-10-02T08:08:45Z","published":"2023-04-01T00:47:35Z","title":"Subject-driven Text-to-Image Generation via Apprenticeship Learning","summary":" Recent text-to-image generation models like DreamBooth have made remarkable\nprogress in generating highly customized images of a target subject, by\nfine-tuning an ``expert model'' for a given subject from a few examples.\nHowever, this process is expensive, since a new expert model must be learned\nfor each subject. In this paper, we present SuTI, a Subject-driven\nText-to-Image generator that replaces subject-specific fine tuning with\nin-context learning. Given a few demonstrations of a new subject, SuTI can\ninstantly generate novel renditions of the subject in different scenes, without\nany subject-specific optimization. SuTI is powered by apprenticeship learning,\nwhere a single apprentice model is learned from data generated by a massive\nnumber of subject-specific expert models. Specifically, we mine millions of\nimage clusters from the Internet, each centered around a specific visual\nsubject. We adopt these clusters to train a massive number of expert models,\neach specializing in a different subject. The apprentice model SuTI then learns\nto imitate the behavior of these fine-tuned experts. SuTI can generate\nhigh-quality and customized subject-specific images 20x faster than\noptimization-based SoTA methods. On the challenging DreamBench and\nDreamBench-v2, our human evaluation shows that SuTI significantly outperforms\nexisting models like InstructPix2Pix, Textual Inversion, Imagic, Prompt2Prompt,\nRe-Imagen and DreamBooth, especially on the subject and text alignment aspects.\n","authors":["Wenhu Chen","Hexiang Hu","Yandong Li","Nataniel Ruiz","Xuhui Jia","Ming-Wei Chang","William W. Cohen"],"pdf_url":"https://arxiv.org/pdf/2304.00186v5.pdf","comment":"Accepted at NeurIPS 2023. Model Service to be appear as Google Vertex\n AI - Instant Tuning\n (https://cloud.google.com/vertex-ai/docs/generative-ai/image/fine-tune-model).\n The link to demo video:\n https://www.youtube.com/watch?v=Q2xQ91D_dhM&t=2071s&ab_channel=GoogleCloud"},{"id":"http://arxiv.org/abs/2302.13293v2","updated":"2023-10-02T07:37:56Z","published":"2023-02-26T11:02:34Z","title":"PDIWS: Thermal Imaging Dataset for Person Detection in Intrusion Warning\n Systems","summary":" In this paper, we present a synthetic thermal imaging dataset for Person\nDetection in Intrusion Warning Systems (PDIWS). The dataset consists of a\ntraining set with 2000 images and a test set with 500 images. 
Each image is\nsynthesized by compounding a subject (intruder) with a background using the\nmodified Poisson image editing method. There are a total of 50 different\nbackgrounds and nearly 1000 subjects divided into five classes according to\nfive human poses: creeping, crawling, stooping, climbing and other. The\npresence of the intruder will be confirmed if the first four poses are\ndetected. Advanced object detection algorithms have been implemented with this\ndataset and give relatively satisfactory results, with the highest mAP values\nof 95.5% and 90.9% for IoU of 0.5 and 0.75 respectively. The dataset is freely\npublished online for research purposes at\nhttps://github.com/thuan-researcher/Intruder-Thermal-Dataset.\n","authors":["Nguyen Duc Thuan","Le Hai Anh","Hoang Si Hong"],"pdf_url":"https://arxiv.org/pdf/2302.13293v2.pdf","comment":"We are considering some issues in the paper"},{"id":"http://arxiv.org/abs/2309.16671v3","updated":"2023-10-02T07:12:53Z","published":"2023-09-28T17:59:56Z","title":"Demystifying CLIP Data","summary":" Contrastive Language-Image Pre-training (CLIP) is an approach that has\nadvanced research and applications in computer vision, fueling modern\nrecognition systems and generative models. We believe that the main ingredient\nto the success of CLIP is its data and not the model architecture or\npre-training objective. However, CLIP only provides very limited information\nabout its data and how it has been collected, leading to works that aim to\nreproduce CLIP's data by filtering with its model parameters. In this work, we\nintend to reveal CLIP's data curation approach and in our pursuit of making it\nopen to the community introduce Metadata-Curated Language-Image Pre-training\n(MetaCLIP). MetaCLIP takes a raw data pool and metadata (derived from CLIP's\nconcepts) and yields a balanced subset over the metadata distribution. Our\nexperimental study rigorously isolates the model and training settings,\nconcentrating solely on data. MetaCLIP applied to CommonCrawl with 400M\nimage-text data pairs outperforms CLIP's data on multiple standard benchmarks.\nIn zero-shot ImageNet classification, MetaCLIP achieves 70.8% accuracy,\nsurpassing CLIP's 68.3% on ViT-B models. Scaling to 1B data, while maintaining\nthe same training budget, attains 72.4%. Our observations hold across various\nmodel sizes, exemplified by ViT-H achieving 80.5%, without any\nbells-and-whistles. Curation code and training data distribution on metadata is\nmade available at https://github.com/facebookresearch/MetaCLIP.\n","authors":["Hu Xu","Saining Xie","Xiaoqing Ellen Tan","Po-Yao Huang","Russell Howes","Vasu Sharma","Shang-Wen Li","Gargi Ghosh","Luke Zettlemoyer","Christoph Feichtenhofer"],"pdf_url":"https://arxiv.org/pdf/2309.16671v3.pdf","comment":"17 pages. arXiv admin note: text overlap with arXiv:2103.00020 by\n other authors"},{"id":"http://arxiv.org/abs/2303.06842v3","updated":"2023-10-02T05:05:01Z","published":"2023-03-13T04:16:42Z","title":"Hierarchical Relationships: A New Perspective to Enhance Scene Graph\n Generation","summary":" This paper presents a finding that leveraging the hierarchical structures\namong labels for relationships and objects can substantially improve the\nperformance of scene graph generation systems. 
The focus of this work is to\ncreate an informative hierarchical structure that can divide object and\nrelationship categories into disjoint super-categories in a systematic way.\nSpecifically, we introduce a Bayesian prediction head to jointly predict the\nsuper-category of relationships between a pair of object instances, as well as\nthe detailed relationship within that super-category simultaneously,\nfacilitating more informative predictions. The resulting model exhibits the\ncapability to produce a more extensive set of predicates beyond the dataset\nannotations, and to tackle the prevalent issue of low annotation quality. While\nour paper presents preliminary findings, experiments on the Visual Genome\ndataset show its strong performance, particularly in predicate classifications\nand zero-shot settings, that demonstrates the promise of our approach.\n","authors":["Bowen Jiang","Camillo J. Taylor"],"pdf_url":"https://arxiv.org/pdf/2303.06842v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17555v2","updated":"2023-10-02T04:47:14Z","published":"2023-05-27T19:10:19Z","title":"Diffeomorphic Deformation via Sliced Wasserstein Distance Optimization\n for Cortical Surface Reconstruction","summary":" Mesh deformation is a core task for 3D mesh reconstruction, but defining an\nefficient discrepancy between predicted and target meshes remains an open\nproblem. A prevalent approach in current deep learning is the set-based\napproach which measures the discrepancy between two surfaces by comparing two\nrandomly sampled point-clouds from the two meshes with Chamfer pseudo-distance.\nNevertheless, the set-based approach still has limitations such as lacking a\ntheoretical guarantee for choosing the number of points in sampled\npoint-clouds, and the pseudo-metricity and the quadratic complexity of the\nChamfer divergence. To address these issues, we propose a novel metric for\nlearning mesh deformation. The metric is defined by sliced Wasserstein distance\non meshes represented as probability measures that generalize the set-based\napproach. By leveraging probability measure space, we gain flexibility in\nencoding meshes using diverse forms of probability measures, such as\ncontinuous, empirical, and discrete measures via \\textit{varifold}\nrepresentation. After having encoded probability measures, we can compare\nmeshes by using the sliced Wasserstein distance which is an effective optimal\ntransport distance with linear computational complexity and can provide a fast\nstatistical rate for approximating the surface of meshes. Furthermore, we\nemploy a neural ordinary differential equation (ODE) to deform the input\nsurface into the target shape by modeling the trajectories of the points on the\nsurface. Our experiments on cortical surface reconstruction demonstrate that\nour approach surpasses other competing methods in multiple datasets and\nmetrics.\n","authors":["Tung Le","Khai Nguyen","Shanlin Sun","Kun Han","Nhat Ho","Xiaohui Xie"],"pdf_url":"https://arxiv.org/pdf/2305.17555v2.pdf","comment":"Update experimental results"},{"id":"http://arxiv.org/abs/2303.09792v2","updated":"2023-10-02T03:41:35Z","published":"2023-03-17T06:26:55Z","title":"Exploring Sparse Visual Prompt for Domain Adaptive Dense Prediction","summary":" The visual prompts have provided an efficient manner in addressing visual\ncross-domain problems. 
In previous works, Visual Domain Prompt (VDP) first\nintroduces domain prompts to tackle the classification Test-Time Adaptation\n(TTA) problem by warping image-level prompts on the input and fine-tuning\nprompts for each target domain. However, since the image-level prompts mask out\ncontinuous spatial details in the prompt-allocated region, it will suffer from\ninaccurate contextual information and limited domain knowledge extraction,\nparticularly when dealing with dense prediction TTA problems. To overcome these\nchallenges, we propose a novel Sparse Visual Domain Prompts (SVDP) approach,\nwhich holds minimal trainable parameters (e.g., 0.1\\%) in the image-level\nprompt and reserves more spatial information of the input. To better apply SVDP\nin extracting domain-specific knowledge, we introduce the Domain Prompt\nPlacement (DPP) method to adaptively allocate trainable parameters of SVDP on\nthe pixels with large distribution shifts. Furthermore, recognizing that each\ntarget domain sample exhibits a unique domain shift, we design a Domain Prompt\nUpdating (DPU) strategy to optimize prompt parameters differently for each\nsample, facilitating efficient adaptation to the target domain. Extensive\nexperiments were conducted on widely-used TTA and continual TTA benchmarks, and\nour proposed method achieves state-of-the-art performance in both semantic\nsegmentation and depth estimation tasks.\n","authors":["Senqiao Yang","Jiarui Wu","Jiaming Liu","Xiaoqi Li","Qizhe Zhang","Mingjie Pan","Yulu Gan","Zehui Chen","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2303.09792v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16463v2","updated":"2023-10-02T03:31:17Z","published":"2023-08-31T05:15:27Z","title":"Sparkles: Unlocking Chats Across Multiple Images for Multimodal\n Instruction-Following Models","summary":" Large language models exhibit enhanced zero-shot performance on various tasks\nwhen fine-tuned with instruction-following data. Multimodal\ninstruction-following models extend these capabilities by integrating both text\nand images. However, existing models such as MiniGPT-4 face challenges in\nmaintaining dialogue coherence in scenarios involving multiple images. A\nprimary reason is the lack of a specialized dataset for this critical\napplication. To bridge these gaps, we present SparklesChat, a multimodal\ninstruction-following model for open-ended dialogues across multiple images. To\nsupport the training, we introduce SparklesDialogue, the first\nmachine-generated dialogue dataset tailored for word-level interleaved\nmulti-image and text interactions. Furthermore, we construct SparklesEval, a\nGPT-assisted benchmark for quantitatively assessing a model's conversational\ncompetence across multiple images and dialogue turns. Our experiments validate\nthe effectiveness of SparklesChat in understanding and reasoning across\nmultiple images and dialogue turns. Specifically, SparklesChat outperformed\nMiniGPT-4 on established vision-and-language benchmarks, including the BISON\nbinary image selection task and the NLVR2 visual reasoning task. Moreover,\nSparklesChat scored 8.56 out of 10 on SparklesEval, substantially exceeding\nMiniGPT-4's score of 3.91 and nearing GPT-4's score of 9.26. Qualitative\nevaluations further demonstrate SparklesChat's generality in handling\nreal-world applications. 
All resources are available at\nhttps://github.com/HYPJUDY/Sparkles.\n","authors":["Yupan Huang","Zaiqiao Meng","Fangyu Liu","Yixuan Su","Nigel Collier","Yutong Lu"],"pdf_url":"https://arxiv.org/pdf/2308.16463v2.pdf","comment":"Reduced main content to 9 pages; typos corrected"},{"id":"http://arxiv.org/abs/2309.14709v3","updated":"2023-10-02T02:37:36Z","published":"2023-09-26T07:04:47Z","title":"Bootstrap Diffusion Model Curve Estimation for High Resolution Low-Light\n Image Enhancement","summary":" Learning-based methods have attracted a lot of research attention and led to\nsignificant improvements in low-light image enhancement. However, most of them\nstill suffer from two main problems: expensive computational cost in high\nresolution images and unsatisfactory performance in simultaneous enhancement\nand denoising. To address these problems, we propose BDCE, a bootstrap\ndiffusion model that exploits the learning of the distribution of the curve\nparameters instead of the normal-light image itself. Specifically, we adopt the\ncurve estimation method to handle the high-resolution images, where the curve\nparameters are estimated by our bootstrap diffusion model. In addition, a\ndenoise module is applied in each iteration of curve adjustment to denoise the\nintermediate enhanced result of each iteration. We evaluate BDCE on commonly\nused benchmark datasets, and extensive experiments show that it achieves\nstate-of-the-art qualitative and quantitative performance.\n","authors":["Jiancheng Huang","Yifan Liu","Shifeng Chen"],"pdf_url":"https://arxiv.org/pdf/2309.14709v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00997v2","updated":"2023-10-02T02:32:03Z","published":"2023-07-03T13:21:58Z","title":"RefSAM: Efficiently Adapting Segmenting Anything Model for Referring\n Video Object Segmentation","summary":" The Segment Anything Model (SAM) has gained significant attention for its\nimpressive performance in image segmentation. However, it lacks proficiency in\nreferring video object segmentation (RVOS) due to the need for precise\nuser-interactive prompts and a limited understanding of different modalities,\nsuch as language and vision. This paper presents the RefSAM model, which\nexplores the potential of SAM for RVOS by incorporating multi-view information\nfrom diverse modalities and successive frames at different timestamps in an\nonline manner. Our proposed approach adapts the original SAM model to enhance\ncross-modality learning by employing a lightweight Cross-Modal MLP that\nprojects the text embedding of the referring expression into sparse and dense\nembeddings, serving as user-interactive prompts. Additionally, we have\nintroduced the hierarchical dense attention module to fuse hierarchical visual\nsemantic information with sparse embeddings in order to obtain fine-grained\ndense embeddings, and an implicit tracking module to generate a track token and\nprovide historical information for the mask decoder. Furthermore, we employ a\nparameter-efficient tuning strategy to effectively align and fuse the language\nand vision features. Through comprehensive ablation studies, we demonstrate the\npractical and effective design choices of our model. Extensive experiments\nconducted on Ref-Youtu-VOS, Ref-DAVIS17, and three referring image segmentation\ndatasets validate the superiority and effectiveness of our RefSAM model over\nexisting methods. 
The code and models will be made publicly at\n\\href{https://github.com/LancasterLi/RefSAM}{github.com/LancasterLi/RefSAM}.\n","authors":["Yonglin Li","Jing Zhang","Xiao Teng","Long Lan"],"pdf_url":"https://arxiv.org/pdf/2307.00997v2.pdf","comment":"The code and models will be made publicly at\n https://github.com/LancasterLi/RefSAM"},{"id":"http://arxiv.org/abs/2309.17444v2","updated":"2023-10-02T01:46:44Z","published":"2023-09-29T17:54:46Z","title":"LLM-grounded Video Diffusion Models","summary":" Text-conditioned diffusion models have emerged as a promising tool for neural\nvideo generation. However, current models still struggle with intricate\nspatiotemporal prompts and often generate restricted or incorrect motion (e.g.,\neven lacking the ability to be prompted for objects moving from left to right).\nTo address these limitations, we introduce LLM-grounded Video Diffusion (LVD).\nInstead of directly generating videos from the text inputs, LVD first leverages\na large language model (LLM) to generate dynamic scene layouts based on the\ntext inputs and subsequently uses the generated layouts to guide a diffusion\nmodel for video generation. We show that LLMs are able to understand complex\nspatiotemporal dynamics from text alone and generate layouts that align closely\nwith both the prompts and the object motion patterns typically observed in the\nreal world. We then propose to guide video diffusion models with these layouts\nby adjusting the attention maps. Our approach is training-free and can be\nintegrated into any video diffusion model that admits classifier guidance. Our\nresults demonstrate that LVD significantly outperforms its base video diffusion\nmodel and several strong baseline methods in faithfully generating videos with\nthe desired attributes and motion patterns.\n","authors":["Long Lian","Baifeng Shi","Adam Yala","Trevor Darrell","Boyi Li"],"pdf_url":"https://arxiv.org/pdf/2309.17444v2.pdf","comment":"Project Page: https://llm-grounded-video-diffusion.github.io/"},{"id":"http://arxiv.org/abs/2310.01701v1","updated":"2023-10-02T23:38:17Z","published":"2023-10-02T23:38:17Z","title":"Transcending Domains through Text-to-Image Diffusion: A Source-Free\n Approach to Domain Adaptation","summary":" Domain Adaptation (DA) is a method for enhancing a model's performance on a\ntarget domain with inadequate annotated data by applying the information the\nmodel has acquired from a related source domain with sufficient labeled data.\nThe escalating enforcement of data-privacy regulations like HIPAA, COPPA,\nFERPA, etc. have sparked a heightened interest in adapting models to novel\ndomains while circumventing the need for direct access to the source data, a\nproblem known as Source-Free Domain Adaptation (SFDA). In this paper, we\npropose a novel framework for SFDA that generates source data using a\ntext-to-image diffusion model trained on the target domain samples. Our method\nstarts by training a text-to-image diffusion model on the labeled target domain\nsamples, which is then fine-tuned using the pre-trained source model to\ngenerate samples close to the source data. Finally, we use Domain Adaptation\ntechniques to align the artificially generated source data with the target\ndomain data, resulting in significant performance improvements of the model on\nthe target domain. 
Through extensive comparison against several baselines on\nthe standard Office-31, Office-Home, and VisDA benchmarks, we demonstrate the\neffectiveness of our approach for the SFDA task.\n","authors":["Shivang Chopra","Suraj Kothawade","Houda Aynaou","Aman Chadha"],"pdf_url":"https://arxiv.org/pdf/2310.01701v1.pdf","comment":"9 pages, 6 figures, 4 tables"},{"id":"http://arxiv.org/abs/2307.09520v2","updated":"2023-10-02T22:48:49Z","published":"2023-07-18T18:01:30Z","title":"Adversarial Bayesian Augmentation for Single-Source Domain\n Generalization","summary":" Generalizing to unseen image domains is a challenging problem primarily due\nto the lack of diverse training data, inaccessible target data, and the large\ndomain shift that may exist in many real-world settings. As such data\naugmentation is a critical component of domain generalization methods that seek\nto address this problem. We present Adversarial Bayesian Augmentation (ABA), a\nnovel algorithm that learns to generate image augmentations in the challenging\nsingle-source domain generalization setting. ABA draws on the strengths of\nadversarial learning and Bayesian neural networks to guide the generation of\ndiverse data augmentations -- these synthesized image domains aid the\nclassifier in generalizing to unseen domains. We demonstrate the strength of\nABA on several types of domain shift including style shift, subpopulation\nshift, and shift in the medical imaging setting. ABA outperforms all previous\nstate-of-the-art methods, including pre-specified augmentations, pixel-based\nand convolutional-based augmentations.\n","authors":["Sheng Cheng","Tejas Gokhale","Yezhou Yang"],"pdf_url":"https://arxiv.org/pdf/2307.09520v2.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2310.01680v1","updated":"2023-10-02T22:31:30Z","published":"2023-10-02T22:31:30Z","title":"Keypoint-Augmented Self-Supervised Learning for Medical Image\n Segmentation with Limited Annotation","summary":" Pretraining CNN models (i.e., UNet) through self-supervision has become a\npowerful approach to facilitate medical image segmentation under low annotation\nregimes. Recent contrastive learning methods encourage similar global\nrepresentations when the same image undergoes different transformations, or\nenforce invariance across different image/patch features that are intrinsically\ncorrelated. However, CNN-extracted global and local features are limited in\ncapturing long-range spatial dependencies that are essential in biological\nanatomy. To this end, we present a keypoint-augmented fusion layer that\nextracts representations preserving both short- and long-range self-attention.\nIn particular, we augment the CNN feature map at multiple scales by\nincorporating an additional input that learns long-range spatial self-attention\namong localized keypoint features. Further, we introduce both global and local\nself-supervised pretraining for the framework. At the global scale, we obtain\nglobal representations from both the bottleneck of the UNet, and by aggregating\nmultiscale keypoint features. These global features are subsequently\nregularized through image-level contrastive objectives. At the local scale, we\ndefine a distance-based criterion to first establish correspondences among\nkeypoints and encourage similarity between their features. 
Through extensive\nexperiments on both MRI and CT segmentation tasks, we demonstrate the\narchitectural advantages of our proposed method in comparison to both CNN and\nTransformer-based UNets, when all architectures are trained with randomly\ninitialized weights. With our proposed pretraining strategy, our method further\noutperforms existing SSL methods by producing more robust self-attention and\nachieving state-of-the-art segmentation results. The code is available at\nhttps://github.com/zshyang/kaf.git.\n","authors":["Zhangsihao Yang","Mengwei Ren","Kaize Ding","Guido Gerig","Yalin Wang"],"pdf_url":"https://arxiv.org/pdf/2310.01680v1.pdf","comment":"Camera ready for NeurIPS 2023. Code available at\n https://github.com/zshyang/kaf.git"},{"id":"http://arxiv.org/abs/2310.01667v1","updated":"2023-10-02T21:58:32Z","published":"2023-10-02T21:58:32Z","title":"STARS: Zero-shot Sim-to-Real Transfer for Segmentation of Shipwrecks in\n Sonar Imagery","summary":" In this paper, we address the problem of sim-to-real transfer for object\nsegmentation when there is no access to real examples of an object of interest\nduring training, i.e. zero-shot sim-to-real transfer for segmentation. We focus\non the application of shipwreck segmentation in side scan sonar imagery. Our\nnovel segmentation network, STARS, addresses this challenge by fusing a\npredicted deformation field and anomaly volume, allowing it to generalize\nbetter to real sonar images and achieve more effective zero-shot sim-to-real\ntransfer for image segmentation. We evaluate the sim-to-real transfer\ncapabilities of our method on a real, expert-labeled side scan sonar dataset of\nshipwrecks collected from field work surveys with an autonomous underwater\nvehicle (AUV). STARS is trained entirely in simulation and performs zero-shot\nshipwreck segmentation with no additional fine-tuning on real data. Our method\nprovides a significant 20% increase in segmentation performance for the\ntargeted shipwreck class compared to the best baseline.\n","authors":["Advaith Venkatramanan Sethuraman","Katherine A. Skinner"],"pdf_url":"https://arxiv.org/pdf/2310.01667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01663v1","updated":"2023-10-02T21:52:50Z","published":"2023-10-02T21:52:50Z","title":"Task-guided Domain Gap Reduction for Monocular Depth Prediction in\n Endoscopy","summary":" Colorectal cancer remains one of the deadliest cancers in the world. In\nrecent years computer-aided methods have aimed to enhance cancer screening and\nimprove the quality and availability of colonoscopies by automatizing\nsub-tasks. One such task is predicting depth from monocular video frames, which\ncan assist endoscopic navigation. As ground truth depth from standard in-vivo\ncolonoscopy remains unobtainable due to hardware constraints, two approaches\nhave aimed to circumvent the need for real training data: supervised methods\ntrained on labeled synthetic data and self-supervised models trained on\nunlabeled real data. However, self-supervised methods depend on unreliable loss\nfunctions that struggle with edges, self-occlusion, and lighting inconsistency.\nMethods trained on synthetic data can provide accurate depth for synthetic\ngeometries but do not use any geometric supervisory signal from real data and\noverfit to synthetic anatomies and properties. This work proposes a novel\napproach to leverage labeled synthetic and unlabeled real data. 
While previous\ndomain adaptation methods indiscriminately enforce the distributions of both\ninput data modalities to coincide, we focus on the end task, depth prediction,\nand translate only essential information between the input domains. Our\napproach results in more resilient and accurate depth maps of real colonoscopy\nsequences.\n","authors":["Anita Rau","Binod Bhattarai","Lourdes Agapito","Danail Stoyanov"],"pdf_url":"https://arxiv.org/pdf/2310.01663v1.pdf","comment":"First Data Engineering in Medical Imaging Workshop at MICCAI 2023"},{"id":"http://arxiv.org/abs/2310.01662v1","updated":"2023-10-02T21:52:47Z","published":"2023-10-02T21:52:47Z","title":"SYRAC: Synthesize, Rank, and Count","summary":" Crowd counting is a critical task in computer vision, with several important\napplications. However, existing counting methods rely on labor-intensive\ndensity map annotations, necessitating the manual localization of each\nindividual pedestrian. While recent efforts have attempted to alleviate the\nannotation burden through weakly or semi-supervised learning, these approaches\nfall short of significantly reducing the workload. We propose a novel approach\nto eliminate the annotation burden by leveraging latent diffusion models to\ngenerate synthetic data. However, these models struggle to reliably understand\nobject quantities, leading to noisy annotations when prompted to produce images\nwith a specific quantity of objects. To address this, we use latent diffusion\nmodels to create two types of synthetic data: one by removing pedestrians from\nreal images, which generates ranked image pairs with a weak but reliable object\nquantity signal, and the other by generating synthetic images with a\npredetermined number of objects, offering a strong but noisy counting signal.\nOur method utilizes the ranking image pairs for pre-training and then fits a\nlinear layer to the noisy synthetic images using these crowd quantity features.\nWe report state-of-the-art results for unsupervised crowd counting.\n","authors":["Adriano D'Alessandro","Ali Mahdavi-Amiri","Ghassan Hamarneh"],"pdf_url":"https://arxiv.org/pdf/2310.01662v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01659v1","updated":"2023-10-02T21:48:19Z","published":"2023-10-02T21:48:19Z","title":"It's all about you: Personalized in-Vehicle Gesture Recognition with a\n Time-of-Flight Camera","summary":" Despite significant advances in gesture recognition technology, recognizing\ngestures in a driving environment remains challenging due to limited and costly\ndata and its dynamic, ever-changing nature. In this work, we propose a\nmodel-adaptation approach to personalize the training of a CNNLSTM model and\nimprove recognition accuracy while reducing data requirements. Our approach\ncontributes to the field of dynamic hand gesture recognition while driving by\nproviding a more efficient and accurate method that can be customized for\nindividual users, ultimately enhancing the safety and convenience of in-vehicle\ninteractions, as well as driver's experience and system trust. We incorporate\nhardware enhancement using a time-of-flight camera and algorithmic enhancement\nthrough data augmentation, personalized adaptation, and incremental learning\ntechniques. 
We evaluate the performance of our approach in terms of recognition\naccuracy, achieving up to 90\\%, and show the effectiveness of personalized\nadaptation and incremental learning for a user-centered design.\n","authors":["Amr Gomaa","Guillermo Reyes","Michael Feld"],"pdf_url":"https://arxiv.org/pdf/2310.01659v1.pdf","comment":"Accepted at AutoUI2023"},{"id":"http://arxiv.org/abs/2309.16889v2","updated":"2023-10-02T21:28:54Z","published":"2023-09-28T23:09:30Z","title":"Superpixel Transformers for Efficient Semantic Segmentation","summary":" Semantic segmentation, which aims to classify every pixel in an image, is a\nkey task in machine perception, with many applications across robotics and\nautonomous driving. Due to the high dimensionality of this task, most existing\napproaches use local operations, such as convolutions, to generate per-pixel\nfeatures. However, these methods are typically unable to effectively leverage\nglobal context information due to the high computational costs of operating on\na dense image. In this work, we propose a solution to this issue by leveraging\nthe idea of superpixels, an over-segmentation of the image, and applying them\nwith a modern transformer framework. In particular, our model learns to\ndecompose the pixel space into a spatially low dimensional superpixel space via\na series of local cross-attentions. We then apply multi-head self-attention to\nthe superpixels to enrich the superpixel features with global context and then\ndirectly produce a class prediction for each superpixel. Finally, we directly\nproject the superpixel class predictions back into the pixel space using the\nassociations between the superpixels and the image pixel features. Reasoning in\nthe superpixel space allows our method to be substantially more computationally\nefficient compared to convolution-based decoder methods. Yet, our method\nachieves state-of-the-art performance in semantic segmentation due to the rich\nsuperpixel features generated by the global self-attention mechanism. Our\nexperiments on Cityscapes and ADE20K demonstrate that our method matches the\nstate of the art in terms of accuracy, while outperforming in terms of model\nparameters and latency.\n","authors":["Alex Zihao Zhu","Jieru Mei","Siyuan Qiao","Hang Yan","Yukun Zhu","Liang-Chieh Chen","Henrik Kretzschmar"],"pdf_url":"https://arxiv.org/pdf/2309.16889v2.pdf","comment":"8 pages, 5 figures, 4 tables. Presented at IROS 2023. Equal\n contribution by A. Zhu and J. Mei"},{"id":"http://arxiv.org/abs/2310.01641v1","updated":"2023-10-02T21:09:43Z","published":"2023-10-02T21:09:43Z","title":"You Only Look at Once for Real-time and Generic Multi-Task","summary":" High precision, lightweight, and real-time responsiveness are three essential\nrequirements for implementing autonomous driving. Considering all of them\nsimultaneously is a challenge. In this study, we present an adaptive,\nreal-time, and lightweight multi-task model designed to concurrently handle\nobject detection, drivable area segmentation, and lane detection tasks. To\nachieve this research objective, we developed an end-to-end multi-task model\nwith a unified and streamlined segmentation structure. Our model operates\nwithout the need for any specific customization structure or loss function. We\nachieved competitive results on the BDD100k dataset, particularly in\nvisualization outcomes. 
The performance results show a mAP50 of 81.1% for\nobject detection, a mIoU of 91.0% for drivable area segmentation, and an IoU of\n28.8% for lane line segmentation. Additionally, we introduced a real-road\ndataset to evaluate our model's performance in a real scene, which\nsignificantly outperforms competitors. This demonstrates that our model not\nonly exhibits competitive performance but is also more flexible and faster than\nexisting multi-task models. The source codes and pre-trained models are\nreleased at https://github.com/JiayuanWang-JW/YOLOv8-multi-task\n","authors":["Jiayuan Wang","Q. M. Jonathan Wu","Ning Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.01641v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01636v1","updated":"2023-10-02T21:02:23Z","published":"2023-10-02T21:02:23Z","title":"Adaptive Visual Scene Understanding: Incremental Scene Graph Generation","summary":" Scene graph generation (SGG) involves analyzing images to extract meaningful\ninformation about objects and their relationships. Given the dynamic nature of\nthe visual world, it becomes crucial for AI systems to detect new objects and\nestablish their new relationships with existing objects. To address the lack of\ncontinual learning methodologies in SGG, we introduce the comprehensive\nContinual ScenE Graph Generation (CSEGG) dataset along with 3 learning\nscenarios and 8 evaluation metrics. Our research investigates the continual\nlearning performances of existing SGG methods on the retention of previous\nobject entities and relationships as they learn new ones. Moreover, we also\nexplore how continual object detection enhances generalization in classifying\nknown relationships on unknown objects. We conduct extensive experiments\nbenchmarking and analyzing the classical two-stage SGG methods and the most\nrecent transformer-based SGG methods in continual learning settings, and gain\nvaluable insights into the CSEGG problem. We invite the research community to\nexplore this emerging field of study.\n","authors":["Naitik Khandelwal","Xiao Liu","Mengmi Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.01636v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03266v3","updated":"2023-10-02T20:47:01Z","published":"2023-07-06T20:00:52Z","title":"Empirical Analysis of a Segmentation Foundation Model in Prostate\n Imaging","summary":" Most state-of-the-art techniques for medical image segmentation rely on\ndeep-learning models. These models, however, are often trained on\nnarrowly-defined tasks in a supervised fashion, which requires expensive\nlabeled datasets. Recent advances in several machine learning domains, such as\nnatural language generation have demonstrated the feasibility and utility of\nbuilding foundation models that can be customized for various downstream tasks\nwith little to no labeled data. This likely represents a paradigm shift for\nmedical imaging, where we expect that foundation models may shape the future of\nthe field. In this paper, we consider a recently developed foundation model for\nmedical image segmentation, UniverSeg. We conduct an empirical evaluation study\nin the context of prostate imaging and compare it against the conventional\napproach of training a task-specific segmentation model. Our results and\ndiscussion highlight several important factors that will likely be important in\nthe development and adoption of foundation models for medical image\nsegmentation.\n","authors":["Heejong Kim","Victor Ion Butoi","Adrian V. Dalca","Daniel J. A. Margolis","Mert R. 
Sabuncu"],"pdf_url":"https://arxiv.org/pdf/2307.03266v3.pdf","comment":"Accepted to MICCAI MedAGI workshop"},{"id":"http://arxiv.org/abs/2210.02808v2","updated":"2023-10-02T20:29:21Z","published":"2022-10-06T10:38:07Z","title":"Effective Self-supervised Pre-training on Low-compute Networks without\n Distillation","summary":" Despite the impressive progress of self-supervised learning (SSL), its\napplicability to low-compute networks has received limited attention. Reported\nperformance has trailed behind standard supervised pre-training by a large\nmargin, barring self-supervised learning from making an impact on models that\nare deployed on device. Most prior works attribute this poor performance to the\ncapacity bottleneck of the low-compute networks and opt to bypass the problem\nthrough the use of knowledge distillation (KD). In this work, we revisit SSL\nfor efficient neural networks, taking a closer look at the detrimental\nfactors causing the practical limitations, and whether they are intrinsic to\nthe self-supervised low-compute setting. We find that, contrary to accepted\nknowledge, there is no intrinsic architectural bottleneck; instead, we diagnose that the\nperformance bottleneck is related to the model complexity vs regularization\nstrength trade-off. In particular, we start by empirically observing that the\nuse of local views can have a dramatic impact on the effectiveness of the SSL\nmethods. This hints at view sampling being one of the performance bottlenecks\nfor SSL on low-capacity networks. We hypothesize that the view sampling\nstrategy for large neural networks, which requires matching views in very\ndiverse spatial scales and contexts, is too demanding for low-capacity\narchitectures. We systematize the design of the view sampling mechanism,\nleading to a new training methodology that consistently improves the\nperformance across different SSL methods (e.g. MoCo-v2, SwAV, DINO), different\nlow-size networks (e.g. MobileNetV2, ResNet18, ResNet34, ViT-Ti), and different\ntasks (linear probe, object detection, instance segmentation and\nsemi-supervised learning). Our best models establish a new state-of-the-art for\nSSL methods on low-compute networks despite not using a KD loss term.\n","authors":["Fuwen Tan","Fatemeh Saleh","Brais Martinez"],"pdf_url":"https://arxiv.org/pdf/2210.02808v2.pdf","comment":"ICLR 2023 Camera Ready. Code is publicly available at\n https://github.com/saic-fi/SSLight"},{"id":"http://arxiv.org/abs/2310.01617v1","updated":"2023-10-02T20:21:43Z","published":"2023-10-02T20:21:43Z","title":"Dynamic Spatio-Temporal Summarization using Information Based Fusion","summary":" In the era of burgeoning data generation, managing and storing large-scale\ntime-varying datasets poses significant challenges. With the rise of\nsupercomputing capabilities, the volume of data produced has soared,\nintensifying storage and I/O overheads. To address this issue, we propose a\ndynamic spatio-temporal data summarization technique that identifies\ninformative features in key timesteps and fuses less informative ones. This\napproach minimizes storage requirements while preserving data dynamics. Unlike\nexisting methods, our method retains both raw and summarized timesteps,\nensuring a comprehensive view of information changes over time. We utilize\ninformation-theoretic measures to guide the fusion process, resulting in a\nvisual representation that captures essential data patterns. 
We demonstrate the\nversatility of our technique across diverse datasets, encompassing\nparticle-based flow simulations, security and surveillance applications, and\nbiological cell interactions within the immune system. Our research\nsignificantly contributes to the realm of data management, introducing enhanced\nefficiency and deeper insights across diverse multidisciplinary domains. We\nprovide a streamlined approach for handling massive datasets that can be\napplied to in situ analysis as well as post hoc analysis. This not only\naddresses the escalating challenges of data storage and I/O overheads but also\nunlocks the potential for informed decision-making. Our method empowers\nresearchers and experts to explore essential temporal dynamics while minimizing\nstorage requirements, thereby fostering a more effective and intuitive\nunderstanding of complex data behaviors.\n","authors":["Humayra Tasnim","Soumya Dutta","Melanie Moses"],"pdf_url":"https://arxiv.org/pdf/2310.01617v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01596v1","updated":"2023-10-02T19:41:42Z","published":"2023-10-02T19:41:42Z","title":"ImagenHub: Standardizing the evaluation of conditional image generation\n models","summary":" Recently, a myriad of conditional image generation and editing models have\nbeen developed to serve different downstream tasks, including text-to-image\ngeneration, text-guided image editing, subject-driven image generation,\ncontrol-guided image generation, etc. However, we observe huge inconsistencies\nin experimental conditions: datasets, inference, and evaluation metrics -\nrender fair comparisons difficult. This paper proposes ImagenHub, which is a\none-stop library to standardize the inference and evaluation of all the\nconditional image generation models. Firstly, we define seven prominent tasks\nand curate high-quality evaluation datasets for them. Secondly, we built a\nunified inference pipeline to ensure fair comparison. Thirdly, we design two\nhuman evaluation scores, i.e. Semantic Consistency and Perceptual Quality,\nalong with comprehensive guidelines to evaluate generated images. We train\nexpert raters to evaluate the model outputs based on the proposed metrics. Our\nhuman evaluation achieves a high inter-worker agreement of Krippendorff's alpha\non 76% models with a value higher than 0.4. We comprehensively evaluated a\ntotal of around 30 models and observed three key takeaways: (1) the existing\nmodels' performance is generally unsatisfying except for Text-guided Image\nGeneration and Subject-driven Image Generation, with 74% models achieving an\noverall score lower than 0.5. (2) we examined the claims from published papers\nand found 83% of them hold with a few exceptions. (3) None of the existing\nautomatic metrics has a Spearman's correlation higher than 0.2 except\nsubject-driven image generation. Moving forward, we will continue our efforts\nto evaluate newly published models and update our leaderboard to keep track of\nthe progress in conditional image generation.\n","authors":["Max Ku","Tianle Li","Kai Zhang","Yujie Lu","Xingyu Fu","Wenwen Zhuang","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2310.01596v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.14267v3","updated":"2023-10-02T18:57:51Z","published":"2022-10-25T18:45:08Z","title":"A Survey on Deep Generative 3D-aware Image Synthesis","summary":" Recent years have seen remarkable progress in deep learning powered visual\ncontent creation. 
This includes deep generative 3D-aware image synthesis, which\nproduces high-fidelity images in a 3D-consistent manner while simultaneously\ncapturing compact surfaces of objects from pure image collections without the\nneed for any 3D supervision, thus bridging the gap between 2D imagery and 3D\nreality. The field of computer vision has been recently captivated by the task\nof deep generative 3D-aware image synthesis, with hundreds of papers appearing\nin top-tier journals and conferences over the past few years (mainly the past\ntwo years), but a comprehensive survey of this remarkable and swift\nprogress is still lacking. Our survey aims to introduce new researchers to this topic, provide a\nuseful reference for related works, and stimulate future research directions\nthrough our discussion section. Apart from the presented papers, we aim to\nconstantly update the latest relevant papers along with corresponding\nimplementations at https://weihaox.github.io/3D-aware-Gen.\n","authors":["Weihao Xia","Jing-Hao Xue"],"pdf_url":"https://arxiv.org/pdf/2210.14267v3.pdf","comment":"Accepted to ACM Computing Surveys. Project page:\n https://weihaox.github.io/3D-aware-Gen"},{"id":"http://arxiv.org/abs/2309.16967v2","updated":"2023-10-02T18:45:49Z","published":"2023-09-29T04:26:25Z","title":"nnSAM: Plug-and-play Segment Anything Model Improves nnUNet Performance","summary":" The recent developments of foundation models in computer vision, especially\nthe Segment Anything Model (SAM), allow scalable and domain-agnostic image\nsegmentation to serve as a general-purpose segmentation tool. In parallel, the\nfield of medical image segmentation has benefited significantly from\nspecialized neural networks like the nnUNet, which is trained on\ndomain-specific datasets and can automatically configure the network to tailor\nto specific segmentation challenges. To combine the advantages of foundation\nmodels and domain-specific models, we present nnSAM, which synergistically\nintegrates the SAM model with the nnUNet model to achieve more accurate and\nrobust medical image segmentation. The nnSAM model leverages the powerful and\nrobust feature extraction capabilities of SAM, while harnessing the automatic\nconfiguration capabilities of nnUNet to promote dataset-tailored learning. Our\ncomprehensive evaluation of the nnSAM model on different sizes of training samples\nshows that it allows few-shot learning, which is highly relevant for medical\nimage segmentation where high-quality, annotated data can be scarce and costly\nto obtain. By melding the strengths of both its predecessors, nnSAM positions\nitself as a potential new benchmark in medical image segmentation, offering a\ntool that combines broad applicability with specialized efficiency. The code is\navailable at https://github.com/Kent0n-Li/Medical-Image-Segmentation.\n","authors":["Yunxiang Li","Bowen Jing","Zihan Li","Jing Wang","You Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.16967v2.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2310.01545v1","updated":"2023-10-02T18:41:23Z","published":"2023-10-02T18:41:23Z","title":"RF-ULM: Deep Learning for Radio-Frequency Ultrasound Localization\n Microscopy","summary":" In Ultrasound Localization Microscopy (ULM), achieving high-resolution images\nrelies on the precise localization of contrast agent particles across\nconsecutive beamformed frames. 
However, our study uncovers an enormous\npotential: The process of delay-and-sum beamforming leads to an irreversible\nreduction of Radio-Frequency (RF) data, while its implications for localization\nremain largely unexplored. The rich contextual information embedded within RF\nwavefronts, including their hyperbolic shape and phase, offers great promise\nfor guiding Deep Neural Networks (DNNs) in challenging localization scenarios.\nTo fully exploit this data, we propose to directly localize scatterers in RF\nsignals. Our approach involves a custom super-resolution DNN using learned\nfeature channel shuffling and a novel semi-global convolutional sampling block\ntailored for reliable and accurate localization in RF input data. Additionally,\nwe introduce a geometric point transformation that facilitates seamless mapping\nbetween B-mode and RF spaces. To validate the effectiveness of our method and\nunderstand the impact of beamforming, we conduct an extensive comparison with\nState-Of-The-Art (SOTA) techniques in ULM. We present the inaugural in vivo\nresults from an RF-trained DNN, highlighting its real-world practicality. Our\nfindings show that RF-ULM bridges the domain gap between synthetic and real\ndatasets, offering a considerable advantage in terms of precision and\ncomplexity. To enable the broader research community to benefit from our\nfindings, our code and the associated SOTA methods are made available at\nhttps://github.com/hahnec/rf-ulm.\n","authors":["Christopher Hahne","Georges Chabouh","Arthur Chavignon","Olivier Couture","Raphael Sznitman"],"pdf_url":"https://arxiv.org/pdf/2310.01545v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01529v1","updated":"2023-10-02T18:17:20Z","published":"2023-10-02T18:17:20Z","title":"Progressive DeepSSM: Training Methodology for Image-To-Shape Deep Models","summary":" Statistical shape modeling (SSM) is an enabling quantitative tool to study\nanatomical shapes in various medical applications. However, directly using 3D\nimages in these applications still has a long way to go. Recent deep learning\nmethods have paved the way for reducing the substantial preprocessing steps to\nconstruct SSMs directly from unsegmented images. Nevertheless, the performance\nof these models is not up to the mark. Inspired by multiscale/multiresolution\nlearning, we propose a new training strategy, progressive DeepSSM, to train\nimage-to-shape deep learning models. The training is performed in multiple\nscales, and each scale utilizes the output from the previous scale. This\nstrategy enables the model to learn coarse shape features in the first scales\nand gradually learn detailed fine shape features in the later scales. We\nleverage shape priors via segmentation-guided multi-task learning and employ\ndeep supervision loss to ensure learning at each scale. Experiments show the\nsuperiority of models trained by the proposed strategy from both quantitative\nand qualitative perspectives. 
This training methodology can be employed to\nimprove the stability and accuracy of any deep learning method for inferring\nstatistical representations of anatomies from medical images and can be adopted\nby existing deep learning methods to improve model accuracy and training\nstability.\n","authors":["Abu Zahid Bin Aziz","Jadie Adams","Shireen Elhabian"],"pdf_url":"https://arxiv.org/pdf/2310.01529v1.pdf","comment":"Accepted in ShapeMI MICCAI 2023: Workshop on Shape in Medical Imaging"},{"id":"http://arxiv.org/abs/2310.01523v1","updated":"2023-10-02T18:14:23Z","published":"2023-10-02T18:14:23Z","title":"Fetal-BET: Brain Extraction Tool for Fetal MRI","summary":" Fetal brain extraction is a necessary first step in most computational fetal\nbrain MRI pipelines. However, it has been a very challenging task due to\nnon-standard fetal head pose, fetal movements during examination, and vastly\nheterogeneous appearance of the developing fetal brain and the neighboring\nfetal and maternal anatomy across various sequences and scanning conditions.\nDevelopment of a machine learning method to effectively address this task\nrequires a large and rich labeled dataset that has not been previously\navailable. As a result, there is currently no method for accurate fetal brain\nextraction on various fetal MRI sequences. In this work, we first built a large\nannotated dataset of approximately 72,000 2D fetal brain MRI images. Our\ndataset covers the three common MRI sequences including T2-weighted,\ndiffusion-weighted, and functional MRI acquired with different scanners.\nMoreover, it includes normal and pathological brains. Using this dataset, we\ndeveloped and validated deep learning methods, by exploiting the power of the\nU-Net style architectures, the attention mechanism, multi-contrast feature\nlearning, and data augmentation for fast, accurate, and generalizable automatic\nfetal brain extraction. Our approach leverages the rich information from\nmulti-contrast (multi-sequence) fetal MRI data, enabling precise delineation of\nthe fetal brain structures. Evaluations on independent test data show that our\nmethod achieves accurate brain extraction on heterogeneous test data acquired\nwith different scanners, on pathological brains, and at various gestational\nstages. This robustness underscores the potential utility of our deep learning\nmodel for fetal brain imaging and image analysis.\n","authors":["Razieh Faghihpirayesh","Davood Karimi","Deniz Erdoğmuş","Ali Gholipour"],"pdf_url":"https://arxiv.org/pdf/2310.01523v1.pdf","comment":"10 pages, 6 figures, 2 TABLES, This work has been submitted to the\n IEEE Transactions on Medical Imaging for possible publication. Copyright may\n be transferred without notice, after which this version may no longer be\n accessible"},{"id":"http://arxiv.org/abs/2310.01506v1","updated":"2023-10-02T18:01:55Z","published":"2023-10-02T18:01:55Z","title":"Direct Inversion: Boosting Diffusion-based Editing with 3 Lines of Code","summary":" Text-guided diffusion models have revolutionized image generation and\nediting, offering exceptional realism and diversity. Specifically, in the\ncontext of diffusion-based editing, where a source image is edited according to\na target prompt, the process commences by acquiring a noisy latent vector\ncorresponding to the source image via the diffusion model. This vector is\nsubsequently fed into separate source and target diffusion branches for\nediting. 
The accuracy of this inversion process significantly impacts the final\nediting outcome, influencing both essential content preservation of the source\nimage and edit fidelity according to the target prompt. Prior inversion\ntechniques aimed at finding a unified solution in both the source and target\ndiffusion branches. However, our theoretical and empirical analyses reveal that\ndisentangling these branches leads to a distinct separation of responsibilities\nfor preserving essential content and ensuring edit fidelity. Building on this\ninsight, we introduce \"Direct Inversion,\" a novel technique achieving optimal\nperformance of both branches with just three lines of code. To assess image\nediting performance, we present PIE-Bench, an editing benchmark with 700 images\nshowcasing diverse scenes and editing types, accompanied by versatile\nannotations and comprehensive evaluation metrics. Compared to state-of-the-art\noptimization-based inversion techniques, our solution not only yields superior\nperformance across 8 editing methods but also achieves nearly an order of\nspeed-up.\n","authors":["Xuan Ju","Ailing Zeng","Yuxuan Bian","Shaoteng Liu","Qiang Xu"],"pdf_url":"https://arxiv.org/pdf/2310.01506v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16772v2","updated":"2023-10-02T18:01:08Z","published":"2023-09-28T18:09:40Z","title":"XVO: Generalized Visual Odometry via Cross-Modal Self-Training","summary":" We propose XVO, a semi-supervised learning method for training generalized\nmonocular Visual Odometry (VO) models with robust off-the-self operation across\ndiverse datasets and settings. In contrast to standard monocular VO approaches\nwhich often study a known calibration within a single dataset, XVO efficiently\nlearns to recover relative pose with real-world scale from visual scene\nsemantics, i.e., without relying on any known camera parameters. We optimize\nthe motion estimation model via self-training from large amounts of\nunconstrained and heterogeneous dash camera videos available on YouTube. Our\nkey contribution is twofold. First, we empirically demonstrate the benefits of\nsemi-supervised training for learning a general-purpose direct VO regression\nnetwork. Second, we demonstrate multi-modal supervision, including\nsegmentation, flow, depth, and audio auxiliary prediction tasks, to facilitate\ngeneralized representations for the VO task. Specifically, we find audio\nprediction task to significantly enhance the semi-supervised learning process\nwhile alleviating noisy pseudo-labels, particularly in highly dynamic and\nout-of-domain video data. Our proposed teacher network achieves\nstate-of-the-art performance on the commonly used KITTI benchmark despite no\nmulti-frame optimization or knowledge of camera parameters. Combined with the\nproposed semi-supervised step, XVO demonstrates off-the-shelf knowledge\ntransfer across diverse conditions on KITTI, nuScenes, and Argoverse without\nfine-tuning.\n","authors":["Lei Lai","Zhongkai Shangguan","Jimuyang Zhang","Eshed Ohn-Bar"],"pdf_url":"https://arxiv.org/pdf/2309.16772v2.pdf","comment":"ICCV 2023, Paris https://genxvo.github.io/"},{"id":"http://arxiv.org/abs/2310.01415v1","updated":"2023-10-02T17:59:57Z","published":"2023-10-02T17:59:57Z","title":"GPT-Driver: Learning to Drive with GPT","summary":" We present a simple yet effective approach that can transform the OpenAI\nGPT-3.5 model into a reliable motion planner for autonomous vehicles. 
Motion\nplanning is a core challenge in autonomous driving, aiming to plan a driving\ntrajectory that is safe and comfortable. Existing motion planners predominantly\nleverage heuristic methods to forecast driving trajectories, yet these\napproaches demonstrate insufficient generalization capabilities in the face of\nnovel and unseen driving scenarios. In this paper, we propose a novel approach\nto motion planning that capitalizes on the strong reasoning capabilities and\ngeneralization potential inherent to Large Language Models (LLMs). The\nfundamental insight of our approach is the reformulation of motion planning as\na language modeling problem, a perspective not previously explored.\nSpecifically, we represent the planner inputs and outputs as language tokens,\nand leverage the LLM to generate driving trajectories through a language\ndescription of coordinate positions. Furthermore, we propose a novel\nprompting-reasoning-finetuning strategy to stimulate the numerical reasoning\npotential of the LLM. With this strategy, the LLM can describe highly precise\ntrajectory coordinates and also its internal decision-making process in natural\nlanguage. We evaluate our approach on the large-scale nuScenes dataset, and\nextensive experiments substantiate the effectiveness, generalization ability,\nand interpretability of our GPT-based motion planner. Code will be released\nupon acceptance.\n","authors":["Jiageng Mao","Yuxi Qian","Hang Zhao","Yue Wang"],"pdf_url":"https://arxiv.org/pdf/2310.01415v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01413v1","updated":"2023-10-02T17:59:56Z","published":"2023-10-02T17:59:56Z","title":"A multi-institutional pediatric dataset of clinical radiology MRIs by\n the Children's Brain Tumor Network","summary":" Pediatric brain and spinal cancers remain the leading cause of cancer-related\ndeath in children. Advancements in clinical decision-support in pediatric\nneuro-oncology utilizing the wealth of radiology imaging data collected through\nstandard care, however, has significantly lagged other domains. Such data is\nripe for use with predictive analytics such as artificial intelligence (AI)\nmethods, which require large datasets. To address this unmet need, we provide a\nmulti-institutional, large-scale pediatric dataset of 23,101 multi-parametric\nMRI exams acquired through routine care for 1,526 brain tumor patients, as part\nof the Children's Brain Tumor Network. This includes longitudinal MRIs across\nvarious cancer diagnoses, with associated patient-level clinical information,\ndigital pathology slides, as well as tissue genotype and omics data. To\nfacilitate downstream analysis, treatment-na\\\"ive images for 370 subjects were\nprocessed and released through the NCI Childhood Cancer Data Initiative via the\nCancer Data Service. Through ongoing efforts to continuously build these\nimaging repositories, our aim is to accelerate discovery and translational AI\nmodels with real-world data, to ultimately empower precision medicine for\nchildren.\n","authors":["Ariana M. Familiar","Anahita Fathi Kazerooni","Hannah Anderson","Aliaksandr Lubneuski","Karthik Viswanathan","Rocky Breslow","Nastaran Khalili","Sina Bagheri","Debanjan Haldar","Meen Chul Kim","Sherjeel Arif","Rachel Madhogarhia","Thinh Q. Nguyen","Elizabeth A. Frenkel","Zeinab Helili","Jessica Harrison","Keyvan Farahani","Marius George Linguraru","Ulas Bagci","Yury Velichko","Jeffrey Stevens","Sarah Leary","Robert M. Lober","Stephani Campion","Amy A. 
Smith","Denise Morinigo","Brian Rood","Kimberly Diamond","Ian F. Pollack","Melissa Williams","Arastoo Vossough","Jeffrey B. Ware","Sabine Mueller","Phillip B. Storm","Allison P. Heath","Angela J. Waanders","Jena V. Lilly","Jennifer L. Mason","Adam C. Resnick","Ali Nabavizadeh"],"pdf_url":"https://arxiv.org/pdf/2310.01413v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01412v1","updated":"2023-10-02T17:59:52Z","published":"2023-10-02T17:59:52Z","title":"DriveGPT4: Interpretable End-to-end Autonomous Driving via Large\n Language Model","summary":" In the past decade, autonomous driving has experienced rapid development in\nboth academia and industry. However, its limited interpretability remains a\nsignificant unsolved problem, severely hindering autonomous vehicle\ncommercialization and further development. Previous approaches utilizing small\nlanguage models have failed to address this issue due to their lack of\nflexibility, generalization ability, and robustness. Recently, multimodal large\nlanguage models (LLMs) have gained considerable attention from the research\ncommunity for their capability to process and reason non-text data (e.g.,\nimages and videos) by text. In this paper, we present DriveGPT4, an\ninterpretable end-to-end autonomous driving system utilizing LLMs. DriveGPT4 is\ncapable of interpreting vehicle actions and providing corresponding reasoning,\nas well as answering diverse questions posed by human users for enhanced\ninteraction. Additionally, DriveGPT4 predicts vehicle low-level control signals\nin an end-to-end fashion. These capabilities stem from a customized visual\ninstruction tuning dataset specifically designed for autonomous driving. To the\nbest of our knowledge, DriveGPT4 is the first work focusing on interpretable\nend-to-end autonomous driving. When evaluated on multiple tasks alongside\nconventional methods and video understanding LLMs, DriveGPT4 demonstrates\nsuperior qualitative and quantitative performance. Additionally, DriveGPT4 can\nbe generalized in a zero-shot fashion to accommodate more unseen scenarios. The\nproject page is available at https://tonyxuqaq.github.io/projects/DriveGPT4/ .\n","authors":["Zhenhua Xu","Yujia Zhang","Enze Xie","Zhen Zhao","Yong Guo","Kenneth K. Y. Wong","Zhenguo Li","Hengshuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.01412v1.pdf","comment":"The project page is available at\n https://tonyxuqaq.github.io/projects/DriveGPT4/"},{"id":"http://arxiv.org/abs/2310.01410v1","updated":"2023-10-02T17:59:37Z","published":"2023-10-02T17:59:37Z","title":"LEAP: Liberate Sparse-view 3D Modeling from Camera Poses","summary":" Are camera poses necessary for multi-view 3D modeling? Existing approaches\npredominantly assume access to accurate camera poses. While this assumption\nmight hold for dense views, accurately estimating camera poses for sparse views\nis often elusive. Our analysis reveals that noisy estimated poses lead to\ndegraded performance for existing sparse-view 3D modeling methods. To address\nthis issue, we present LEAP, a novel pose-free approach, therefore challenging\nthe prevailing notion that camera poses are indispensable. LEAP discards\npose-based operations and learns geometric knowledge from data. LEAP is\nequipped with a neural volume, which is shared across scenes and is\nparameterized to encode geometry and texture priors. For each incoming scene,\nwe update the neural volume by aggregating 2D image features in a\nfeature-similarity-driven manner. 
The updated neural volume is decoded into the\nradiance field, enabling novel view synthesis from any viewpoint. On both\nobject-centric and scene-level datasets, we show that LEAP significantly\noutperforms prior methods when they employ predicted poses from\nstate-of-the-art pose estimators. Notably, LEAP performs on par with prior\napproaches that use ground-truth poses while running $400\\times$ faster than\nPixelNeRF. We show LEAP generalizes to novel object categories and scenes, and\nlearns knowledge closely resembles epipolar geometry. Project page:\nhttps://hwjiang1510.github.io/LEAP/\n","authors":["Hanwen Jiang","Zhenyu Jiang","Yue Zhao","Qixing Huang"],"pdf_url":"https://arxiv.org/pdf/2310.01410v1.pdf","comment":"Project page https://hwjiang1510.github.io/LEAP/"},{"id":"http://arxiv.org/abs/2310.01407v1","updated":"2023-10-02T17:59:18Z","published":"2023-10-02T17:59:18Z","title":"Conditional Diffusion Distillation","summary":" Generative diffusion models provide strong priors for text-to-image\ngeneration and thereby serve as a foundation for conditional generation tasks\nsuch as image editing, restoration, and super-resolution. However, one major\nlimitation of diffusion models is their slow sampling time. To address this\nchallenge, we present a novel conditional distillation method designed to\nsupplement the diffusion priors with the help of image conditions, allowing for\nconditional sampling with very few steps. We directly distill the unconditional\npre-training in a single stage through joint-learning, largely simplifying the\nprevious two-stage procedures that involve both distillation and conditional\nfinetuning separately. Furthermore, our method enables a new\nparameter-efficient distillation mechanism that distills each task with only a\nsmall number of additional parameters combined with the shared frozen\nunconditional backbone. Experiments across multiple tasks including\nsuper-resolution, image editing, and depth-to-image generation demonstrate that\nour method outperforms existing distillation techniques for the same sampling\ntime. Notably, our method is the first distillation strategy that can match the\nperformance of the much slower fine-tuned conditional diffusion models.\n","authors":["Kangfu Mei","Mauricio Delbracio","Hossein Talebi","Zhengzhong Tu","Vishal M. Patel","Peyman Milanfar"],"pdf_url":"https://arxiv.org/pdf/2310.01407v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01406v1","updated":"2023-10-02T17:59:17Z","published":"2023-10-02T17:59:17Z","title":"HumanNorm: Learning Normal Diffusion Model for High-quality and\n Realistic 3D Human Generation","summary":" Recent text-to-3D methods employing diffusion models have made significant\nadvancements in 3D human generation. However, these approaches face challenges\ndue to the limitations of the text-to-image diffusion model, which lacks an\nunderstanding of 3D structures. Consequently, these methods struggle to achieve\nhigh-quality human generation, resulting in smooth geometry and cartoon-like\nappearances. In this paper, we observed that fine-tuning text-to-image\ndiffusion models with normal maps enables their adaptation into text-to-normal\ndiffusion models, which enhances the 2D perception of 3D geometry while\npreserving the priors learned from large-scale datasets. 
Therefore, we propose\nHumanNorm, a novel approach for high-quality and realistic 3D human generation\nby learning the normal diffusion model including a normal-adapted diffusion\nmodel and a normal-aligned diffusion model. The normal-adapted diffusion model\ncan generate high-fidelity normal maps corresponding to prompts with\nview-dependent text. The normal-aligned diffusion model learns to generate\ncolor images aligned with the normal maps, thereby transforming physical\ngeometry details into realistic appearance. Leveraging the proposed normal\ndiffusion model, we devise a progressive geometry generation strategy and\ncoarse-to-fine texture generation strategy to enhance the efficiency and\nrobustness of 3D human generation. Comprehensive experiments substantiate our\nmethod's ability to generate 3D humans with intricate geometry and realistic\nappearances, significantly outperforming existing text-to-3D methods in both\ngeometry and texture quality. The project page of HumanNorm is\nhttps://humannorm.github.io/.\n","authors":["Xin Huang","Ruizhi Shao","Qi Zhang","Hongwen Zhang","Ying Feng","Yebin Liu","Qing Wang"],"pdf_url":"https://arxiv.org/pdf/2310.01406v1.pdf","comment":"The project page of HumanNorm is https://humannorm.github.io/"},{"id":"http://arxiv.org/abs/2310.01404v1","updated":"2023-10-02T17:59:03Z","published":"2023-10-02T17:59:03Z","title":"H-InDex: Visual Reinforcement Learning with Hand-Informed\n Representations for Dexterous Manipulation","summary":" Human hands possess remarkable dexterity and have long served as a source of\ninspiration for robotic manipulation. In this work, we propose a human\n$\\textbf{H}$and$\\textbf{-In}$formed visual representation learning framework to\nsolve difficult $\\textbf{Dex}$terous manipulation tasks ($\\textbf{H-InDex}$)\nwith reinforcement learning. Our framework consists of three stages: (i)\npre-training representations with 3D human hand pose estimation, (ii) offline\nadapting representations with self-supervised keypoint detection, and (iii)\nreinforcement learning with exponential moving average BatchNorm. The last two\nstages only modify $0.36\\%$ parameters of the pre-trained representation in\ntotal, ensuring the knowledge from pre-training is maintained to the full\nextent. We empirically study 12 challenging dexterous manipulation tasks and\nfind that H-InDex largely surpasses strong baseline methods and the recent\nvisual foundation models for motor control. Code is available at\nhttps://yanjieze.com/H-InDex .\n","authors":["Yanjie Ze","Yuyao Liu","Ruizhe Shi","Jiaxin Qin","Zhecheng Yuan","Jiashun Wang","Huazhe Xu"],"pdf_url":"https://arxiv.org/pdf/2310.01404v1.pdf","comment":"NeurIPS 2023. Code and videos: https://yanjieze.com/H-InDex"},{"id":"http://arxiv.org/abs/2310.01403v1","updated":"2023-10-02T17:58:52Z","published":"2023-10-02T17:58:52Z","title":"CLIPSelf: Vision Transformer Distills Itself for Open-Vocabulary Dense\n Prediction","summary":" Open-vocabulary dense prediction tasks including object detection and image\nsegmentation have been advanced by the success of Contrastive Language-Image\nPre-training (CLIP). CLIP models, particularly those incorporating vision\ntransformers (ViTs), have exhibited remarkable generalization ability in\nzero-shot image classification. 
However, when transferring the vision-language\nalignment of CLIP from global image representation to local region\nrepresentation for the open-vocabulary dense prediction tasks, CLIP ViTs suffer\nfrom the domain shift from full images to local image regions. In this paper,\nwe embark on an in-depth analysis of the region-language alignment in CLIP\nmodels, which is essential for downstream open-vocabulary dense prediction\ntasks. Subsequently, we propose an approach named CLIPSelf, which adapts the\nimage-level recognition ability of CLIP ViT to local image regions without\nneeding any region-text pairs. CLIPSelf empowers ViTs to distill itself by\naligning a region representation extracted from its dense feature map with the\nimage-level representation of the corresponding image crop. With the enhanced\nCLIP ViTs, we achieve new state-of-the-art performance on open-vocabulary\nobject detection, semantic segmentation, and panoptic segmentation across\nvarious benchmarks. Models and code will be available at\nhttps://github.com/wusize/CLIPSelf.\n","authors":["Size Wu","Wenwei Zhang","Lumin Xu","Sheng Jin","Xiangtai Li","Wentao Liu","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2310.01403v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01401v1","updated":"2023-10-02T17:58:51Z","published":"2023-10-02T17:58:51Z","title":"Pixel-Aligned Recurrent Queries for Multi-View 3D Object Detection","summary":" We present PARQ - a multi-view 3D object detector with transformer and\npixel-aligned recurrent queries. Unlike previous works that use learnable\nfeatures or only encode 3D point positions as queries in the decoder, PARQ\nleverages appearance-enhanced queries initialized from reference points in 3D\nspace and updates their 3D location with recurrent cross-attention operations.\nIncorporating pixel-aligned features and cross attention enables the model to\nencode the necessary 3D-to-2D correspondences and capture global contextual\ninformation of the input images. PARQ outperforms prior best methods on the\nScanNet and ARKitScenes datasets, learns and detects faster, is more robust to\ndistribution shifts in reference points, can leverage additional input views\nwithout retraining, and can adapt inference compute by changing the number of\nrecurrent iterations.\n","authors":["Yiming Xie","Huaizu Jiang","Georgia Gkioxari","Julian Straub"],"pdf_url":"https://arxiv.org/pdf/2310.01401v1.pdf","comment":"ICCV 2023. Project page: https://ymingxie.github.io/parq"},{"id":"http://arxiv.org/abs/2310.01400v1","updated":"2023-10-02T17:58:47Z","published":"2023-10-02T17:58:47Z","title":"Sequential Data Generation with Groupwise Diffusion Process","summary":" We present the Groupwise Diffusion Model (GDM), which divides data into\nmultiple groups and diffuses one group at one time interval in the forward\ndiffusion process. GDM generates data sequentially from one group at one time\ninterval, leading to several interesting properties. First, as an extension of\ndiffusion models, GDM generalizes certain forms of autoregressive models and\ncascaded diffusion models. As a unified framework, GDM allows us to investigate\ndesign choices that have been overlooked in previous works, such as\ndata-grouping strategy and order of generation. Furthermore, since one group of\nthe initial noise affects only a certain group of the generated data, latent\nspace now possesses group-wise interpretable meaning. 
We can further extend GDM\nto the frequency domain where the forward process sequentially diffuses each\ngroup of frequency components. Dividing the frequency bands of the data as\ngroups allows the latent variables to become a hierarchical representation\nwhere individual groups encode data at different levels of abstraction. We\ndemonstrate several applications of such representation including\ndisentanglement of semantic attributes, image editing, and generating\nvariations.\n","authors":["Sangyun Lee","Gayoung Lee","Hyunsu Kim","Junho Kim","Youngjung Uh"],"pdf_url":"https://arxiv.org/pdf/2310.01400v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01393v1","updated":"2023-10-02T17:52:24Z","published":"2023-10-02T17:52:24Z","title":"DST-Det: Simple Dynamic Self-Training for Open-Vocabulary Object\n Detection","summary":" Open-vocabulary object detection (OVOD) aims to detect the objects beyond the\nset of categories observed during training. This work presents a simple yet\neffective strategy that leverages the zero-shot classification ability of\npre-trained vision-language models (VLM), such as CLIP, to classify proposals\nfor all possible novel classes directly. Unlike previous works that ignore\nnovel classes during training and rely solely on the region proposal network\n(RPN) for novel object detection, our method selectively filters proposals\nbased on specific design criteria. The resulting sets of identified proposals\nserve as pseudo-labels for novel classes during the training phase. It enables\nour self-training strategy to improve the recall and accuracy of novel classes\nin a self-training manner without requiring additional annotations or datasets.\nWe further propose a simple offline pseudo-label generation strategy to refine\nthe object detector. Empirical evaluations on three datasets, including LVIS,\nV3Det, and COCO, demonstrate significant improvements over the baseline\nperformance without incurring additional parameters or computational costs\nduring inference. In particular, compared with previous F-VLM, our method\nachieves a 1.7-2.0% improvement on LVIS dataset and 2.3-3.8% improvement on the\nrecent challenging V3Det dataset. Our method also boosts the strong baseline by\n6% mAP on COCO. The code and models will be publicly available at\nhttps://github.com/xushilin1/dst-det.\n","authors":["Shilin Xu","Xiangtai Li","Size Wu","Wenwei Zhang","Yining Li","Guangliang Cheng","Yunhai Tong","Kai Chen","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2310.01393v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01379v1","updated":"2023-10-02T17:41:56Z","published":"2023-10-02T17:41:56Z","title":"EXTRACTER: Efficient Texture Matching with Attention and Gradient\n Enhancing for Large Scale Image Super Resolution","summary":" Recent Reference-Based image super-resolution (RefSR) has improved SOTA deep\nmethods introducing attention mechanisms to enhance low-resolution images by\ntransferring high-resolution textures from a reference high-resolution image.\nThe main idea is to search for matches between patches using LR and Reference\nimage pair in a feature space and merge them using deep architectures. However,\nexisting methods lack the accurate search of textures. They divide images into\nas many patches as possible, resulting in inefficient memory usage, and cannot\nmanage large images. 
Herein, we propose a deep search with a more efficient\nmemory usage that reduces significantly the number of image patches and finds\nthe $k$ most relevant texture match for each low-resolution patch over the\nhigh-resolution reference patches, resulting in an accurate texture match. We\nenhance the Super Resolution result adding gradient density information using a\nsimple residual architecture showing competitive metrics results: PSNR and\nSSMI.\n","authors":["Esteban Reyes-Saldana","Mariano Rivera"],"pdf_url":"https://arxiv.org/pdf/2310.01379v1.pdf","comment":"5 pages, 3 figures, 3 tables"},{"id":"http://arxiv.org/abs/2310.01376v1","updated":"2023-10-02T17:39:58Z","published":"2023-10-02T17:39:58Z","title":"Towards Distribution-Agnostic Generalized Category Discovery","summary":" Data imbalance and open-ended distribution are two intrinsic characteristics\nof the real visual world. Though encouraging progress has been made in tackling\neach challenge separately, few works dedicated to combining them towards\nreal-world scenarios. While several previous works have focused on classifying\nclose-set samples and detecting open-set samples during testing, it's still\nessential to be able to classify unknown subjects as human beings. In this\npaper, we formally define a more realistic task as distribution-agnostic\ngeneralized category discovery (DA-GCD): generating fine-grained predictions\nfor both close- and open-set classes in a long-tailed open-world setting. To\ntackle the challenging problem, we propose a Self-Balanced Co-Advice\ncontrastive framework (BaCon), which consists of a contrastive-learning branch\nand a pseudo-labeling branch, working collaboratively to provide interactive\nsupervision to resolve the DA-GCD task. In particular, the contrastive-learning\nbranch provides reliable distribution estimation to regularize the predictions\nof the pseudo-labeling branch, which in turn guides contrastive learning\nthrough self-balanced knowledge transfer and a proposed novel contrastive loss.\nWe compare BaCon with state-of-the-art methods from two closely related fields:\nimbalanced semi-supervised learning and generalized category discovery. The\neffectiveness of BaCon is demonstrated with superior performance over all\nbaselines and comprehensive analysis across various datasets. Our code is\npublicly available.\n","authors":["Jianhong Bai","Zuozhu Liu","Hualiang Wang","Ruizhe Chen","Lianrui Mu","Xiaomeng Li","Joey Tianyi Zhou","Yang Feng","Jian Wu","Haoji Hu"],"pdf_url":"https://arxiv.org/pdf/2310.01376v1.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2310.01361v1","updated":"2023-10-02T17:23:48Z","published":"2023-10-02T17:23:48Z","title":"GenSim: Generating Robotic Simulation Tasks via Large Language Models","summary":" Collecting large amounts of real-world interaction data to train general\nrobotic policies is often prohibitively expensive, thus motivating the use of\nsimulation data. However, existing methods for data generation have generally\nfocused on scene-level diversity (e.g., object instances and poses) rather than\ntask-level diversity, due to the human effort required to come up with and\nverify novel tasks. This has made it challenging for policies trained on\nsimulation data to demonstrate significant task-level generalization. In this\npaper, we propose to automatically generate rich simulation environments and\nexpert demonstrations by exploiting a large language models' (LLM) grounding\nand coding ability. 
Our approach, dubbed GenSim, has two modes: goal-directed\ngeneration, wherein a target task is given to the LLM and the LLM proposes a\ntask curriculum to solve the target task, and exploratory generation, wherein\nthe LLM bootstraps from previous tasks and iteratively proposes novel tasks\nthat would be helpful in solving more complex tasks. We use GPT4 to expand the\nexisting benchmark by ten times to over 100 tasks, on which we conduct\nsupervised finetuning and evaluate several LLMs including finetuned GPTs and\nCode Llama on code generation for robotic simulation tasks. Furthermore, we\nobserve that LLMs-generated simulation programs can enhance task-level\ngeneralization significantly when used for multitask policy training. We\nfurther find that with minimal sim-to-real adaptation, the multitask policies\npretrained on GPT4-generated simulation tasks exhibit stronger transfer to\nunseen long-horizon tasks in the real world and outperform baselines by 25%.\nSee the project website (https://liruiw.github.io/gensim) for code, demos, and\nvideos.\n","authors":["Lirui Wang","Yiyang Ling","Zhecheng Yuan","Mohit Shridhar","Chen Bao","Yuzhe Qin","Bailin Wang","Huazhe Xu","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2310.01361v1.pdf","comment":"See our project website (https://liruiw.github.io/gensim), demo\n (https://huggingface.co/spaces/Gen-Sim/Gen-Sim), and code\n (https://github.com/liruiw/GenSim) for visualizations and open-source models\n and datasets"},{"id":"http://arxiv.org/abs/2310.01358v1","updated":"2023-10-02T17:21:25Z","published":"2023-10-02T17:21:25Z","title":"NEUCORE: Neural Concept Reasoning for Composed Image Retrieval","summary":" Composed image retrieval which combines a reference image and a text modifier\nto identify the desired target image is a challenging task, and requires the\nmodel to comprehend both vision and language modalities and their interactions.\nExisting approaches focus on holistic multi-modal interaction modeling, and\nignore the composed and complimentary property between the reference image and\ntext modifier. In order to better utilize the complementarity of multi-modal\ninputs for effective information fusion and retrieval, we move the multi-modal\nunderstanding to fine-granularity at concept-level, and learn the multi-modal\nconcept alignment to identify the visual location in reference or target images\ncorresponding to text modifier. Toward the end, we propose a NEUral COncept\nREasoning (NEUCORE) model which incorporates multi-modal concept alignment and\nprogressive multimodal fusion over aligned concepts. Specifically, considering\nthat text modifier may refer to semantic concepts not existing in the reference\nimage and requiring to be added into the target image, we learn the multi-modal\nconcept alignment between the text modifier and the concatenation of reference\nand target images, under multiple-instance learning framework with image and\nsentence level weak supervision. 
Furthermore, based on aligned concepts, to\nform discriminative fusion features of the input modalities for accurate target\nimage retrieval, we propose a progressive fusion strategy with unified\nexecution architecture instantiated by the attended language semantic concepts.\nOur proposed approach is evaluated on three datasets and achieves\nstate-of-the-art results.\n","authors":["Shu Zhao","Huijuan Xu"],"pdf_url":"https://arxiv.org/pdf/2310.01358v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01356v1","updated":"2023-10-02T17:19:04Z","published":"2023-10-02T17:19:04Z","title":"Less is More: Toward Zero-Shot Local Scene Graph Generation via\n Foundation Models","summary":" Humans inherently recognize objects via selective visual perception,\ntransform specific regions from the visual field into structured symbolic\nknowledge, and reason their relationships among regions based on the allocation\nof limited attention resources in line with humans' goals. While it is\nintuitive for humans, contemporary perception systems falter in extracting\nstructural information due to the intricate cognitive abilities and commonsense\nknowledge required. To fill this gap, we present a new task called Local Scene\nGraph Generation. Distinct from the conventional scene graph generation task,\nwhich encompasses generating all objects and relationships in an image, our\nproposed task aims to abstract pertinent structural information with partial\nobjects and their relationships for boosting downstream tasks that demand\nadvanced comprehension and reasoning capabilities. Correspondingly, we\nintroduce zEro-shot Local scEne GrAph geNeraTion (ELEGANT), a framework\nharnessing foundation models renowned for their powerful perception and\ncommonsense reasoning, where collaboration and information communication among\nfoundation models yield superior outcomes and realize zero-shot local scene\ngraph generation without requiring labeled supervision. Furthermore, we propose\na novel open-ended evaluation metric, Entity-level CLIPScorE (ECLIPSE),\nsurpassing previous closed-set evaluation metrics by transcending their limited\nlabel space, offering a broader assessment. Experiment results show that our\napproach markedly outperforms baselines in the open-ended evaluation setting,\nand it also achieves a significant performance boost of up to 24.58% over prior\nmethods in the close-set setting, demonstrating the effectiveness and powerful\nreasoning ability of our proposed framework.\n","authors":["Shu Zhao","Huijuan Xu"],"pdf_url":"https://arxiv.org/pdf/2310.01356v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01351v1","updated":"2023-10-02T17:13:16Z","published":"2023-10-02T17:13:16Z","title":"Streaming Motion Forecasting for Autonomous Driving","summary":" Trajectory forecasting is a widely-studied problem for autonomous navigation.\nHowever, existing benchmarks evaluate forecasting based on independent\nsnapshots of trajectories, which are not representative of real-world\napplications that operate on a continuous stream of data. To bridge this gap,\nwe introduce a benchmark that continuously queries future trajectories on\nstreaming data and we refer to it as \"streaming forecasting.\" Our benchmark\ninherently captures the disappearance and re-appearance of agents, presenting\nthe emergent challenge of forecasting for occluded agents, which is a\nsafety-critical problem yet overlooked by snapshot-based benchmarks. 
Moreover,\nforecasting in the context of continuous timestamps naturally asks for temporal\ncoherence between predictions from adjacent timestamps. Based on this\nbenchmark, we further provide solutions and analysis for streaming forecasting.\nWe propose a plug-and-play meta-algorithm called \"Predictive Streamer\" that can\nadapt any snapshot-based forecaster into a streaming forecaster. Our algorithm\nestimates the states of occluded agents by propagating their positions with\nmulti-modal trajectories, and leverages differentiable filters to ensure\ntemporal consistency. Both occlusion reasoning and temporal coherence\nstrategies significantly improve forecasting quality, resulting in 25% smaller\nendpoint errors for occluded agents and 10-20% smaller fluctuations of\ntrajectories. Our work is intended to generate interest within the community by\nhighlighting the importance of addressing motion forecasting in its intrinsic\nstreaming setting. Code is available at\nhttps://github.com/ziqipang/StreamingForecasting.\n","authors":["Ziqi Pang","Deva Ramanan","Mengtian Li","Yu-Xiong Wang"],"pdf_url":"https://arxiv.org/pdf/2310.01351v1.pdf","comment":"IROS 2023, 8 pages, 9 figures"},{"id":"http://arxiv.org/abs/2310.01330v1","updated":"2023-10-02T16:48:50Z","published":"2023-10-02T16:48:50Z","title":"Towards reporting bias in visual-language datasets: bimodal augmentation\n by decoupling object-attribute association","summary":" Reporting bias arises when people assume that some knowledge is universally\nunderstood and hence, do not necessitate explicit elaboration. In this paper,\nwe focus on the wide existence of reporting bias in visual-language datasets,\nembodied as the object-attribute association, which can subsequentially degrade\nmodels trained on them. To mitigate this bias, we propose a bimodal\naugmentation (BiAug) approach through object-attribute decoupling to flexibly\nsynthesize visual-language examples with a rich array of object-attribute\npairing and construct cross-modal hard negatives. We employ large language\nmodels (LLMs) in conjunction with a grounding object detector to extract target\nobjects. Subsequently, the LLM generates a detailed attribute description for\neach object and produces a corresponding hard negative counterpart. An\ninpainting model is then used to create images based on these detailed object\ndescriptions. By doing so, the synthesized examples explicitly complement\nomitted objects and attributes to learn, and the hard negative pairs steer the\nmodel to distinguish object attributes. Our experiments demonstrated that BiAug\nis superior in object-attribute understanding. In addition, BiAug also improves\nthe performance on zero-shot retrieval tasks on general benchmarks like MSCOCO\nand Flickr30K. BiAug refines the way of collecting text-image datasets.\nMitigating the reporting bias helps models achieve a deeper understanding of\nvisual-language phenomena, expanding beyond mere frequent patterns to encompass\nthe richness and diversity of real-world scenarios.\n","authors":["Qiyu Wu","Mengjie Zhao","Yutong He","Lang Huang","Junya Ono","Hiromi Wakaki","Yuki Mitsufuji"],"pdf_url":"https://arxiv.org/pdf/2310.01330v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01324v1","updated":"2023-10-02T16:41:20Z","published":"2023-10-02T16:41:20Z","title":"ZeroI2V: Zero-Cost Adaptation of Pre-trained Transformers from Image to\n Video","summary":" Adapting image models to video domain is becoming an efficient paradigm for\nsolving video recognition tasks. 
Due to the huge number of parameters and\neffective transferability of image models, performing full fine-tuning is less\nefficient and even unnecessary. Thus, recent research is shifting its focus\ntowards parameter-efficient image-to-video adaptation. However, these\nadaptation strategies inevitably introduce extra computational cost to deal\nwith the domain gap and temporal modeling in videos. In this paper, our goal is\nto present a zero-cost adaptation paradigm (ZeroI2V) to transfer the image\ntransformers to video recognition tasks (i.e., introduce zero extra cost to the\nadapted models during inference). To achieve this goal, we present two core\ndesigns. First, to capture the dynamics in videos and reduce the difficulty of\nachieving image-to-video adaptation, we exploit the flexibility of\nself-attention and introduce the spatial-temporal dual-headed attention (STDHA)\nthat efficiently endow the image transformers with temporal modeling capability\nat zero extra parameters and computation. Second, to handle the domain gap\nbetween images and videos, we propose a linear adaption strategy which utilizes\nlightweight densely placed linear adapters to fully transfer the frozen image\nmodels to video recognition. Due to its customized linear design, all newly\nadded adapters could be easily merged with the original modules through\nstructural reparameterization after training, thus achieving zero extra cost\nduring inference. Extensive experiments on four widely-used video recognition\nbenchmarks show that our ZeroI2V can match or even outperform previous\nstate-of-the-art methods while enjoying superior parameter and inference\nefficiency.\n","authors":["Xinhao Li","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2310.01324v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01321v1","updated":"2023-10-02T16:29:49Z","published":"2023-10-02T16:29:49Z","title":"Color and Texture Dual Pipeline Lightweight Style Transfer","summary":" Style transfer methods typically generate a single stylized output of color\nand texture coupling for reference styles, and color transfer schemes may\nintroduce distortion or artifacts when processing reference images with\nduplicate textures. To solve the problem, we propose a Color and Texture Dual\nPipeline Lightweight Style Transfer CTDP method, which employs a dual pipeline\nmethod to simultaneously output the results of color and texture transfer.\nFurthermore, we designed a masked total variation loss to suppress artifacts\nand small texture representations in color transfer results without affecting\nthe semantic part of the content. More importantly, we are able to add texture\nstructures with controllable intensity to color transfer results for the first\ntime. Finally, we conducted feature visualization analysis on the texture\ngeneration mechanism of the framework and found that smoothing the input image\ncan almost completely eliminate this texture structure. In comparative\nexperiments, the color and texture transfer results generated by CTDP both\nachieve state-of-the-art performance. 
Additionally, the weight of the color\ntransfer branch model size is as low as 20k, which is 100-1500 times smaller\nthan that of other state-of-the-art models.\n","authors":["ShiQi Jiang"],"pdf_url":"https://arxiv.org/pdf/2310.01321v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01292v1","updated":"2023-10-02T15:46:59Z","published":"2023-10-02T15:46:59Z","title":"Efficient Remote Sensing Segmentation With Generative Adversarial\n Transformer","summary":" Most deep learning methods that achieve high segmentation accuracy require\ndeep network architectures that are too heavy and complex to run on embedded\ndevices with limited storage and memory space. To address this issue, this\npaper proposes an efficient Generative Adversarial Transfomer (GATrans) for\nachieving high-precision semantic segmentation while maintaining an extremely\nefficient size. The framework utilizes a Global Transformer Network (GTNet) as\nthe generator, efficiently extracting multi-level features through residual\nconnections. GTNet employs global transformer blocks with progressively linear\ncomputational complexity to reassign global features based on a learnable\nsimilarity function. To focus on object-level and pixel-level information, the\nGATrans optimizes the objective function by combining structural similarity\nlosses. We validate the effectiveness of our approach through extensive\nexperiments on the Vaihingen dataset, achieving an average F1 score of 90.17%\nand an overall accuracy of 91.92%.\n","authors":["Luyi Qiu","Dayu Yu","Xiaofeng Zhang","Chenxiao Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.01292v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01291v1","updated":"2023-10-02T15:46:25Z","published":"2023-10-02T15:46:25Z","title":"3DHR-Co: A Collaborative Test-time Refinement Framework for In-the-Wild\n 3D Human-Body Reconstruction Task","summary":" The field of 3D human-body reconstruction (abbreviated as 3DHR) that utilizes\nparametric pose and shape representations has witnessed significant\nadvancements in recent years. However, the application of 3DHR techniques to\nhandle real-world, diverse scenes, known as in-the-wild data, still faces\nlimitations. The primary challenge arises as curating accurate 3D human pose\nground truth (GT) for in-the-wild scenes is still difficult to obtain due to\nvarious factors. Recent test-time refinement approaches on 3DHR leverage\ninitial 2D off-the-shelf human keypoints information to support the lack of 3D\nsupervision on in-the-wild data. However, we observed that additional 2D\nsupervision alone could cause the overfitting issue on common 3DHR backbones,\nmaking the 3DHR test-time refinement task seem intractable. We answer this\nchallenge by proposing a strategy that complements 3DHR test-time refinement\nwork under a collaborative approach. Specifically, we initially apply a\npre-adaptation approach that works by collaborating various 3DHR models in a\nsingle framework to directly improve their initial outputs. This approach is\nthen further combined with the test-time adaptation work under specific\nsettings that minimize the overfitting issue to further boost the 3DHR\nperformance. The whole framework is termed as 3DHR-Co, and on the experiment\nsides, we showed that the proposed work can significantly enhance the scores of\ncommon classic 3DHR backbones up to -34 mm pose error suppression, putting them\namong the top list on the in-the-wild benchmark data. 
Such achievement shows\nthat our approach helps unveil the true potential of the common classic 3DHR\nbackbones. Based on these findings, we further investigate various settings on\nthe proposed framework to better elaborate the capability of our collaborative\napproach in the 3DHR task.\n","authors":["Jonathan Samuel Lumentut","Kyoung Mu Lee"],"pdf_url":"https://arxiv.org/pdf/2310.01291v1.pdf","comment":"12 pages, 7 figures"},{"id":"http://arxiv.org/abs/2310.01288v1","updated":"2023-10-02T15:41:35Z","published":"2023-10-02T15:41:35Z","title":"Offline Tracking with Object Permanence","summary":" To reduce the expensive labor cost for manual labeling autonomous driving\ndatasets, an alternative is to automatically label the datasets using an\noffline perception system. However, objects might be temporally occluded. Such\nocclusion scenarios in the datasets are common yet underexplored in offline\nautolabeling. In this work, we propose an offline tracking model that focuses\non occluded object tracks. It leverages the concept of object permanence which\nmeans objects continue to exist even if they are not observed anymore. The\nmodel contains three parts: a standard online tracker, a re-identification\n(Re-ID) module that associates tracklets before and after occlusion, and a\ntrack completion module that completes the fragmented tracks. The Re-ID module\nand the track completion module use the vectorized map as one of the inputs to\nrefine the tracking results with occlusion. The model can effectively recover\nthe occluded object trajectories. It achieves state-of-the-art performance in\n3D multi-object tracking by improving over the original online tracking result\nby 45% IDS and 2% AMOTA on the vehicle tracks.\n","authors":["Xianzhong Liu","Holger Caesar"],"pdf_url":"https://arxiv.org/pdf/2310.01288v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01258v1","updated":"2023-10-02T14:50:14Z","published":"2023-10-02T14:50:14Z","title":"MobileNVC: Real-time 1080p Neural Video Compression on a Mobile Device","summary":" Neural video codecs have recently become competitive with standard codecs\nsuch as HEVC in the low-delay setting. However, most neural codecs are large\nfloating-point networks that use pixel-dense warping operations for temporal\nmodeling, making them too computationally expensive for deployment on mobile\ndevices. Recent work has demonstrated that running a neural decoder in real\ntime on mobile is feasible, but shows this only for 720p RGB video, while the\nYUV420 format is more commonly used in production. This work presents the first\nneural video codec that decodes 1080p YUV420 video in real time on a mobile\ndevice. Our codec relies on two major contributions. First, we design an\nefficient codec that uses a block-based motion compensation algorithm available\non the warping core of the mobile accelerator, and we show how to quantize this\nmodel to integer precision. Second, we implement a fast decoder pipeline that\nconcurrently runs neural network components on the neural signal processor,\nparallel entropy coding on the mobile GPU, and warping on the warping core. Our\ncodec outperforms the previous on-device codec by a large margin with up to 48\n% BD-rate savings, while reducing the MAC count on the receiver side by 10x. 
We\nperform a careful ablation to demonstrate the effect of the introduced motion\ncompensation scheme, and ablate the effect of model quantization.\n","authors":["Ties van Rozendaal","Tushar Singhal","Hoang Le","Guillaume Sautiere","Amir Said","Krishna Buska","Anjuman Raha","Dimitris Kalatzis","Hitarth Mehta","Frank Mayer","Liang Zhang","Markus Nagel","Auke Wiggers"],"pdf_url":"https://arxiv.org/pdf/2310.01258v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01251v1","updated":"2023-10-02T14:39:10Z","published":"2023-10-02T14:39:10Z","title":"Generating 3D Brain Tumor Regions in MRI using Vector-Quantization\n Generative Adversarial Networks","summary":" Medical image analysis has significantly benefited from advancements in deep\nlearning, particularly in the application of Generative Adversarial Networks\n(GANs) for generating realistic and diverse images that can augment training\ndatasets. However, the effectiveness of such approaches is often limited by the\namount of available data in clinical settings. Additionally, the common\nGAN-based approach is to generate entire image volumes, rather than solely the\nregion of interest (ROI). Research on deep learning-based brain tumor\nclassification using MRI has shown that it is easier to classify the tumor ROIs\ncompared to the entire image volumes. In this work, we present a novel\nframework that uses vector-quantization GAN and a transformer incorporating\nmasked token modeling to generate high-resolution and diverse 3D brain tumor\nROIs that can be directly used as augmented data for the classification of\nbrain tumor ROI. We apply our method to two imbalanced datasets where we\naugment the minority class: (1) the Multimodal Brain Tumor Segmentation\nChallenge (BraTS) 2019 dataset to generate new low-grade glioma (LGG) ROIs to\nbalance with high-grade glioma (HGG) class; (2) the internal pediatric LGG\n(pLGG) dataset tumor ROIs with BRAF V600E Mutation genetic marker to balance\nwith BRAF Fusion genetic marker class. We show that the proposed method\noutperforms various baseline models in both qualitative and quantitative\nmeasurements. The generated data was used to balance the data in the brain\ntumor types classification task. Using the augmented data, our approach\nsurpasses baseline models by 6.4% in AUC on the BraTS 2019 dataset and 4.3% in\nAUC on our internal pLGG dataset. The results indicate the generated tumor ROIs\ncan effectively address the imbalanced data problem. Our proposed method has\nthe potential to facilitate an accurate diagnosis of rare brain tumors using\nMRI scans.\n","authors":["Meng Zhou","Matthias W Wagner","Uri Tabori","Cynthia Hawkins","Birgit B Ertl-Wagner","Farzad Khalvati"],"pdf_url":"https://arxiv.org/pdf/2310.01251v1.pdf","comment":"Preprint, In Submission"},{"id":"http://arxiv.org/abs/2310.01236v1","updated":"2023-10-02T14:26:31Z","published":"2023-10-02T14:26:31Z","title":"Mirror Diffusion Models for Constrained and Watermarked Generation","summary":" Modern successes of diffusion models in learning complex, high-dimensional\ndata distributions are attributed, in part, to their capability to construct\ndiffusion processes with analytic transition kernels and score functions. 
The\ntractability results in a simulation-free framework with stable regression\nlosses, from which reversed, generative processes can be learned at scale.\nHowever, when data is confined to a constrained set as opposed to a standard\nEuclidean space, these desirable characteristics appear to be lost based on\nprior attempts. In this work, we propose Mirror Diffusion Models (MDM), a new\nclass of diffusion models that generate data on convex constrained sets without\nlosing any tractability. This is achieved by learning diffusion processes in a\ndual space constructed from a mirror map, which, crucially, is a standard\nEuclidean space. We derive efficient computation of mirror maps for popular\nconstrained sets, such as simplices and $\\ell_2$-balls, showing significantly\nimproved performance of MDM over existing methods. For safety and privacy\npurposes, we also explore constrained sets as a new mechanism to embed\ninvisible but quantitative information (i.e., watermarks) in generated data,\nfor which MDM serves as a compelling approach. Our work brings new algorithmic\nopportunities for learning tractable diffusion on complex domains.\n","authors":["Guan-Horng Liu","Tianrong Chen","Evangelos A. Theodorou","Molei Tao"],"pdf_url":"https://arxiv.org/pdf/2310.01236v1.pdf","comment":"submitted to NeurIPS on 5/18 but did not arxiv per NeurIPS policy,\n accepted on 9/22"},{"id":"http://arxiv.org/abs/2310.01228v1","updated":"2023-10-02T14:16:13Z","published":"2023-10-02T14:16:13Z","title":"Reconstructing 3D Human Pose from RGB-D Data with Occlusions","summary":" We propose a new method to reconstruct the 3D human body from RGB-D images\nwith occlusions. The foremost challenge is the incompleteness of the RGB-D data\ndue to occlusions between the body and the environment, leading to implausible\nreconstructions that suffer from severe human-scene penetration. To reconstruct\na semantically and physically plausible human body, we propose to reduce the\nsolution space based on scene information and prior knowledge. Our key idea is\nto constrain the solution space of the human body by considering the occluded\nbody parts and visible body parts separately: modeling all plausible poses\nwhere the occluded body parts do not penetrate the scene, and constraining the\nvisible body parts using depth data. Specifically, the first component is\nrealized by a neural network that estimates the candidate region named the\n\"free zone\", a region carved out of the open space within which it is safe to\nsearch for poses of the invisible body parts without concern for penetration.\nThe second component constrains the visible body parts using the \"truncated\nshadow volume\" of the scanned body point cloud. Furthermore, we propose to use\na volume matching strategy, which yields better performance than surface\nmatching, to match the human body with the confined region. 
We conducted\nexperiments on the PROX dataset, and the results demonstrate that our method\nproduces more accurate and plausible results compared with other methods.\n","authors":["Bowen Dang","Xi Zhao","Bowen Zhang","He Wang"],"pdf_url":"https://arxiv.org/pdf/2310.01228v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2306.08018v3","updated":"2023-10-02T15:27:20Z","published":"2023-06-13T14:35:34Z","title":"Mol-Instructions: A Large-Scale Biomolecular Instruction Dataset for\n Large Language Models","summary":" Large Language Models (LLMs), with their remarkable task-handling\ncapabilities and innovative outputs, have catalyzed significant advancements\nacross a spectrum of fields. However, their proficiency within specialized\ndomains such as biomolecular studies remains limited. To address this\nchallenge, we introduce Mol-Instructions, a comprehensive instruction dataset\ndesigned for the biomolecular domain. Mol-Instructions encompasses three key\ncomponents: molecule-oriented instructions, protein-oriented instructions, and\nbiomolecular text instructions. Each component aims to improve the\nunderstanding and prediction capabilities of LLMs concerning biomolecular\nfeatures and behaviors. Through extensive instruction tuning experiments on\nLLMs, we demonstrate the effectiveness of Mol-Instructions in enhancing large\nmodels' performance in the intricate realm of biomolecular studies, thus\nfostering progress in the biomolecular research community. Mol-Instructions is\npublicly available for ongoing research and will undergo regular updates to\nenhance its applicability.\n","authors":["Yin Fang","Xiaozhuan Liang","Ningyu Zhang","Kangwei Liu","Rui Huang","Zhuo Chen","Xiaohui Fan","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2306.08018v3.pdf","comment":"Project homepage: https://github.com/zjunlp/Mol-Instructions"},{"id":"http://arxiv.org/abs/2310.01696v1","updated":"2023-10-02T23:23:00Z","published":"2023-10-02T23:23:00Z","title":"DANI: Fast Diffusion Aware Network Inference with Preserving Topological\n Structure Property","summary":" The fast growth of social networks and their data access limitations in\nrecent years has led to increasing difficulty in obtaining the complete\ntopology of these networks. However, diffusion information over these networks\nis available, and many algorithms have been proposed to infer the underlying\nnetworks using this information. The previously proposed algorithms only focus\non inferring more links and ignore preserving the critical topological\ncharacteristics of the underlying social networks. In this paper, we propose a\nnovel method called DANI to infer the underlying network while preserving its\nstructural properties. It is based on the Markov transition matrix derived from\ntime series cascades, as well as the node-node similarity that can be observed\nin the cascade behavior from a structural point of view. In addition, the\npresented method has linear time complexity (increases linearly with the number\nof nodes, number of cascades, and square of the average length of cascades),\nand its distributed version in the MapReduce framework is also scalable. We\napplied the proposed approach to both real and synthetic networks. 
The\nexperimental results showed that DANI has higher accuracy and lower run time\nwhile maintaining structural properties, including modular structure, degree\ndistribution, connected components, density, and clustering coefficients, than\nwell-known network inference methods.\n","authors":["Maryam Ramezani","Aryan Ahadinia","Erfan Farhadi","Hamid R. Rabiee"],"pdf_url":"https://arxiv.org/pdf/2310.01696v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:1706.00941"},{"id":"http://arxiv.org/abs/2310.01612v1","updated":"2023-10-02T20:03:42Z","published":"2023-10-02T20:03:42Z","title":"Towards Efficient and Effective Adaptation of Large Language Models for\n Sequential Recommendation","summary":" In recent years, with large language models (LLMs) achieving state-of-the-art\nperformance in context understanding, increasing efforts have been dedicated to\ndeveloping LLM-enhanced sequential recommendation (SR) methods. Considering\nthat most existing LLMs are not specifically optimized for recommendation\ntasks, adapting them for SR becomes a critical step in LLM-enhanced SR methods.\nThough numerous adaptation methods have been developed, it still remains a\nsignificant challenge to adapt LLMs for SR both efficiently and effectively. To\naddress this challenge, in this paper, we introduce a novel side sequential\nnetwork adaptation method, denoted as SSNA, for LLM enhanced SR. SSNA features\nthree key designs to allow both efficient and effective LLM adaptation. First,\nSSNA learns adapters separate from LLMs, while fixing all the pre-trained\nparameters within LLMs to allow efficient adaptation. In addition, SSNA adapts\nthe top-a layers of LLMs jointly, and integrates adapters sequentially for\nenhanced effectiveness (i.e., recommendation performance). We compare SSNA\nagainst five state-of-the-art baseline methods on five benchmark datasets using\nthree LLMs. The experimental results demonstrate that SSNA significantly\noutperforms all the baseline methods in terms of recommendation performance,\nand achieves substantial improvement over the best-performing baseline methods\nat both run-time and memory efficiency during training. Our analysis shows the\neffectiveness of integrating adapters in a sequential manner. Our parameter\nstudy demonstrates the effectiveness of jointly adapting the top-a layers of\nLLMs.\n","authors":["Bo Peng","Ben Burns","Ziqi Chen","Srinivasan Parthasarathy","Xia Ning"],"pdf_url":"https://arxiv.org/pdf/2310.01612v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.14662v2","updated":"2023-10-02T19:25:51Z","published":"2023-09-26T04:36:12Z","title":"Transformer-based classification of user queries for medical consultancy\n with respect to expert specialization","summary":" The need for skilled medical support is growing in the era of digital\nhealthcare. This research presents an innovative strategy, utilizing the RuBERT\nmodel, for categorizing user inquiries in the field of medical consultation\nwith a focus on expert specialization. By harnessing the capabilities of\ntransformers, we fine-tuned the pre-trained RuBERT model on a varied dataset,\nwhich facilitates precise correspondence between queries and particular medical\nspecialisms. Using a comprehensive dataset, we have demonstrated our approach's\nsuperior performance with an F1-score of over 92%, calculated through both\ncross-validation and the traditional split of test and train datasets. 
Our\napproach has shown excellent generalization across medical domains such as\ncardiology, neurology and dermatology. This methodology provides practical\nbenefits by directing users to appropriate specialists for prompt and targeted\nmedical advice. It also enhances healthcare system efficiency, reduces\npractitioner burden, and improves patient care quality. In summary, our\nsuggested strategy facilitates the attainment of specific medical knowledge,\noffering prompt and precise advice within the digital healthcare field.\n","authors":["Dmitry Lyutkin","Andrey Soloviev","Dmitry Zhukov","Denis Pozdnyakov","Muhammad Shahid Iqbal Malik","Dmitry I. Ignatov"],"pdf_url":"https://arxiv.org/pdf/2309.14662v2.pdf","comment":"16 pages, 5 figures"},{"id":"http://arxiv.org/abs/2310.01565v1","updated":"2023-10-02T18:56:05Z","published":"2023-10-02T18:56:05Z","title":"Causality-informed Rapid Post-hurricane Building Damage Detection in\n Large Scale from InSAR Imagery","summary":" Timely and accurate assessment of hurricane-induced building damage is\ncrucial for effective post-hurricane response and recovery efforts. Recently,\nremote sensing technologies provide large-scale optical or Interferometric\nSynthetic Aperture Radar (InSAR) imagery data immediately after a disastrous\nevent, which can be readily used to conduct rapid building damage assessment.\nCompared to optical satellite imageries, the Synthetic Aperture Radar can\npenetrate cloud cover and provide more complete spatial coverage of damaged\nzones in various weather conditions. However, these InSAR imageries often\ncontain highly noisy and mixed signals induced by co-occurring or co-located\nbuilding damage, flood, flood/wind-induced vegetation changes, as well as\nanthropogenic activities, making it challenging to extract accurate building\ndamage information. In this paper, we introduced an approach for rapid\npost-hurricane building damage detection from InSAR imagery. This approach\nencoded complex causal dependencies among wind, flood, building damage, and\nInSAR imagery using a holistic causal Bayesian network. Based on the causal\nBayesian network, we further jointly inferred the large-scale unobserved\nbuilding damage by fusing the information from InSAR imagery with prior\nphysical models of flood and wind, without the need for ground truth labels.\nFurthermore, we validated our estimation results in a real-world devastating\nhurricane -- the 2022 Hurricane Ian. We gathered and annotated building damage\nground truth data in Lee County, Florida, and compared the introduced method's\nestimation results with the ground truth and benchmarked it against\nstate-of-the-art models to assess the effectiveness of our proposed method.\nResults show that our method achieves rapid and accurate detection of building\ndamage, with significantly reduced processing time compared to traditional\nmanual inspection methods.\n","authors":["Chenguang Wang","Yepeng Liu","Xiaojian Zhang","Xuechun Li","Vladimir Paramygin","Arthriya Subgranon","Peter Sheng","Xilei Zhao","Susu Xu"],"pdf_url":"https://arxiv.org/pdf/2310.01565v1.pdf","comment":"6 pages, 3 figures"},{"id":"http://arxiv.org/abs/2310.01507v1","updated":"2023-10-02T18:02:37Z","published":"2023-10-02T18:02:37Z","title":"Replicating Relevance-Ranked Synonym Discovery in a New Language and\n Domain","summary":" Domain-specific synonyms occur in many specialized search tasks, such as when\nsearching medical documents, legal documents, and software engineering\nartifacts. 
We replicate prior work on ranking domain-specific synonyms in the\nconsumer health domain by applying the approach to a new language and domain:\nidentifying Swedish language synonyms in the building construction domain. We\nchose this setting because identifying synonyms in this domain is helpful for\ndownstream systems, where different users may query for documents (e.g.,\nengineering requirements) using different terminology. We consider two new\nfeatures inspired by the change in language and methodological advances since\nthe prior work's publication. An evaluation using data from the building\nconstruction domain supports the finding from the prior work that synonym\ndiscovery is best approached as a learning to rank task in which a human editor\nviews ranked synonym candidates in order to construct a domain-specific\nthesaurus. We additionally find that FastText embeddings alone provide a strong\nbaseline, though they do not perform as well as the strongest learning to rank\nmethod. Finally, we analyze the performance of individual features and the\ndifferences in the domains.\n","authors":["Andrew Yates","Michael Unterkalmsteiner"],"pdf_url":"https://arxiv.org/pdf/2310.01507v1.pdf","comment":"ECIR (1) 2019: 429-442"},{"id":"http://arxiv.org/abs/2310.01271v1","updated":"2023-10-02T15:16:31Z","published":"2023-10-02T15:16:31Z","title":"LEEC: A Legal Element Extraction Dataset with an Extensive\n Domain-Specific Label System","summary":" As a pivotal task in natural language processing, element extraction has\ngained significance in the legal domain. Extracting legal elements from\njudicial documents helps enhance interpretative and analytical capacities of\nlegal cases, and thereby facilitating a wide array of downstream applications\nin various domains of law. Yet existing element extraction datasets are limited\nby their restricted access to legal knowledge and insufficient coverage of\nlabels. To address this shortfall, we introduce a more comprehensive,\nlarge-scale criminal element extraction dataset, comprising 15,831 judicial\ndocuments and 159 labels. This dataset was constructed through two main steps:\nFirst, designing the label system by our team of legal experts based on prior\nlegal research which identified critical factors driving and processes\ngenerating sentencing outcomes in criminal cases; Second, employing the legal\nknowledge to annotate judicial documents according to the label system and\nannotation guideline. The Legal Element ExtraCtion dataset (LEEC) represents\nthe most extensive and domain-specific legal element extraction dataset for the\nChinese legal system. Leveraging the annotated data, we employed various SOTA\nmodels that validates the applicability of LEEC for Document Event Extraction\n(DEE) task. The LEEC dataset is available on https://github.com/THUlawtech/LEEC .\n","authors":["Xue Zongyue","Liu Huanghai","Hu Yiran","Kong Kangle","Wang Chenlu","Liu Yun","Shen Weixing"],"pdf_url":"https://arxiv.org/pdf/2310.01271v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01146v1","updated":"2023-10-02T12:33:01Z","published":"2023-10-02T12:33:01Z","title":"NewsRecLib: A PyTorch-Lightning Library for Neural News Recommendation","summary":" NewsRecLib is an open-source library based on Pytorch-Lightning and Hydra\ndeveloped for training and evaluating neural news recommendation models. 
The\nforemost goals of NewsRecLib are to promote reproducible research and rigorous\nexperimental evaluation by (i) providing a unified and highly configurable\nframework for exhaustive experimental studies and (ii) enabling a thorough\nanalysis of the performance contribution of different model architecture\ncomponents and training regimes. NewsRecLib is highly modular, allows\nspecifying experiments in a single configuration file, and includes extensive\nlogging facilities. Moreover, NewsRecLib provides out-of-the-box\nimplementations of several prominent neural models, training methods, standard\nevaluation benchmarks, and evaluation metrics for news recommendation.\n","authors":["Andreea Iana","Goran Glavaš","Heiko Paulheim"],"pdf_url":"https://arxiv.org/pdf/2310.01146v1.pdf","comment":"Accepted at the 2023 Conference on Empirical Methods in Natural\n Language Processing (EMNLP 2023)"},{"id":"http://arxiv.org/abs/2310.01038v1","updated":"2023-10-02T09:30:11Z","published":"2023-10-02T09:30:11Z","title":"Dataset Condensation for Recommendation","summary":" Training recommendation models on large datasets often requires significant\ntime and computational resources. Consequently, an emergent imperative has\narisen to construct informative, smaller-scale datasets for efficiently\ntraining. Dataset compression techniques explored in other domains show\npotential possibility to address this problem, via sampling a subset or\nsynthesizing a small dataset. However, applying existing approaches to condense\nrecommendation datasets is impractical due to following challenges: (i)\nsampling-based methods are inadequate in addressing the long-tailed\ndistribution problem; (ii) synthesizing-based methods are not applicable due to\ndiscreteness of interactions and large size of recommendation datasets; (iii)\nneither of them fail to address the specific issue in recommendation of false\nnegative items, where items with potential user interest are incorrectly\nsampled as negatives owing to insufficient exposure.\n To bridge this gap, we investigate dataset condensation for recommendation,\nwhere discrete interactions are continualized with probabilistic\nre-parameterization. To avoid catastrophically expensive computations, we adopt\na one-step update strategy for inner model training and introducing policy\ngradient estimation for outer dataset synthesis. To mitigate amplification of\nlong-tailed problem, we compensate long-tailed users in the condensed dataset.\nFurthermore, we propose to utilize a proxy model to identify false negative\nitems. Theoretical analysis regarding the convergence property is provided.\nExtensive experiments on multiple datasets demonstrate the efficacy of our\nmethod. In particular, we reduce the dataset size by 75% while approximating\nover 98% of the original performance on Dianping and over 90% on other\ndatasets.\n","authors":["Jiahao Wu","Wenqi Fan","Shengcai Liu","Qijiong Liu","Rui He","Qing Li","Ke Tang"],"pdf_url":"https://arxiv.org/pdf/2310.01038v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00896v1","updated":"2023-10-02T04:26:07Z","published":"2023-10-02T04:26:07Z","title":"Organized Event Participant Prediction Enhanced by Social Media\n Retweeting Data","summary":" Nowadays, many platforms on the Web offer organized events, allowing users to\nbe organizers or participants. For such platforms, it is beneficial to predict\npotential event participants. Existing work on this problem tends to borrow\nrecommendation techniques. 
However, compared to e-commerce items and purchases,\nevents and participation are usually of a much smaller frequency, and the data\nmay be insufficient to learn an accurate model. In this paper, we propose to\nutilize social media retweeting activity data to enhance the learning of event\nparticipant prediction models. We create a joint knowledge graph to bridge the\nsocial media and the target domain, assuming that event descriptions and tweets\nare written in the same language. Furthermore, we propose a learning model that\nutilizes retweeting information for the target domain prediction more\neffectively. We conduct comprehensive experiments in two scenarios with\nreal-world data. In each scenario, we set up training data of different sizes,\nas well as warm and cold test cases. The evaluation results show that our\napproach consistently outperforms several baseline models, especially with the\nwarm test cases, and when target domain data is limited.\n","authors":["Yihong Zhang","Takahiro Hara"],"pdf_url":"https://arxiv.org/pdf/2310.00896v1.pdf","comment":"Accepted in WI-IAT 2023"},{"id":"http://arxiv.org/abs/2310.00870v1","updated":"2023-10-02T03:19:45Z","published":"2023-10-02T03:19:45Z","title":"F0 analysis of Ghanaian pop singing reveals progressive alignment with\n equal temperament over the past three decades: a case study","summary":" Contemporary Ghanaian popular singing combines European and traditional\nGhanaian influences. We hypothesize that access to technology embedded with\nequal temperament catalyzed a progressive alignment of Ghanaian singing with\nequal-tempered scales over time. To test this, we study the Ghanaian singer\nDaddy Lumba, whose work spans from the earliest Ghanaian electronic style in\nthe late 1980s to the present. Studying a singular musician as a case study\nallows us to refine our analysis without over-interpreting the findings. We\ncurated a collection of his songs, distributed between 1989 and 2016, to\nextract F0 values from isolated vocals. We used Gaussian mixture modeling (GMM)\nto approximate each song's scale and found that the pitch variance has been\ndecreasing over time. We also determined whether the GMM components follow the\narithmetic relationships observed in equal-tempered scales, and observed that\nDaddy Lumba's singing better aligns with equal temperament in recent years.\nTogether, results reveal the impact of exposure to equal-tempered scales,\nresulting in lessened microtonal content in Daddy Lumba's singing. Our study\nhighlights a potential vulnerability of Ghanaian musical scales and implies a\nneed for research that maps and archives singing styles.\n","authors":["Iran R. Roman","Daniel Faronbi","Isabelle Burger-Weiser","Leila Adu-Gilmore"],"pdf_url":"https://arxiv.org/pdf/2310.00870v1.pdf","comment":"Pages 27-33"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2309.15188v2","updated":"2023-10-02T17:48:04Z","published":"2023-09-26T18:49:30Z","title":"ICML 2023 Topological Deep Learning Challenge : Design and Results","summary":" This paper presents the computational challenge on topological deep learning\nthat was hosted within the ICML 2023 Workshop on Topology and Geometry in\nMachine Learning. The competition asked participants to provide open-source\nimplementations of topological neural networks from the literature by\ncontributing to the python packages TopoNetX (data processing) and TopoModelX\n(deep learning). The challenge attracted twenty-eight qualifying submissions in\nits two-month duration. 
This paper describes the design of the challenge and\nsummarizes its main findings.\n","authors":["Mathilde Papillon","Mustafa Hajij","Florian Frantzen","Josef Hoppe","Helen Jenne","Johan Mathe","Audun Myers","Theodore Papamarkou","Michael T. Schaub","Ghada Zamzmi","Tolga Birdal","Tamal Dey","Tim Doster","Tegan Emerson","Gurusankar Gopalakrishnan","Devendra Govil","Vincent Grande","Aldo Guzmán-Sáenz","Henry Kvinge","Neal Livesay","Jan Meisner","Soham Mukherjee","Shreyas N. Samaga","Karthikeyan Natesan Ramamurthy","Maneel Reddy Karri","Paul Rosen","Sophia Sanborn","Michael Scholkemper","Robin Walters","Jens Agerberg","Georg Bökman","Sadrodin Barikbin","Claudio Battiloro","Gleb Bazhenov","Guillermo Bernardez","Aiden Brent","Sergio Escalera","Simone Fiorellino","Dmitrii Gavrilev","Mohammed Hassanin","Paul Häusner","Odin Hoff Gardaa","Abdelwahed Khamis","Manuel Lecha","German Magai","Tatiana Malygina","Pavlo Melnyk","Rubén Ballester","Kalyan Nadimpalli","Alexander Nikitin","Abraham Rabinowitz","Alessandro Salatiello","Simone Scardapane","Luca Scofano","Suraj Singh","Jens Sjölund","Pavel Snopov","Indro Spinelli","Lev Telyatnikov","Lucia Testa","Maosheng Yang","Yixiao Yue","Olga Zaghen","Ali Zia","Nina Miolane"],"pdf_url":"https://arxiv.org/pdf/2309.15188v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11819v3","updated":"2023-10-02T17:46:40Z","published":"2023-08-22T22:43:20Z","title":"A Counterfactual Fair Model for Longitudinal Electronic Health Records\n via Deconfounder","summary":" The fairness issue of clinical data modeling, especially on Electronic Health\nRecords (EHRs), is of utmost importance due to EHR's complex latent structure\nand potential selection bias. It is frequently necessary to mitigate health\ndisparity while keeping the model's overall accuracy in practice. However,\ntraditional methods often encounter the trade-off between accuracy and\nfairness, as they fail to capture the underlying factors beyond observed data.\nTo tackle this challenge, we propose a novel model called Fair Longitudinal\nMedical Deconfounder (FLMD) that aims to achieve both fairness and accuracy in\nlongitudinal Electronic Health Records (EHR) modeling. Drawing inspiration from\nthe deconfounder theory, FLMD employs a two-stage training process. In the\nfirst stage, FLMD captures unobserved confounders for each encounter, which\neffectively represents underlying medical factors beyond observed EHR, such as\npatient genotypes and lifestyle habits. This unobserved confounder is crucial\nfor addressing the accuracy/fairness dilemma. In the second stage, FLMD\ncombines the learned latent representation with other relevant features to make\npredictions. By incorporating appropriate fairness criteria, such as\ncounterfactual fairness, FLMD ensures that it maintains high prediction\naccuracy while simultaneously minimizing health disparities. We conducted\ncomprehensive experiments on two real-world EHR datasets to demonstrate the\neffectiveness of FLMD. 
Apart from the comparison of baseline methods and FLMD\nvariants in terms of fairness and accuracy, we assessed the performance of all\nmodels on disturbed/imbalanced and synthetic datasets to showcase the\nsuperiority of FLMD across different settings and provide valuable insights\ninto its capabilities.\n","authors":["Zheng Liu","Xiaohan Li","Philip Yu"],"pdf_url":"https://arxiv.org/pdf/2308.11819v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.01913v3","updated":"2023-10-02T16:59:40Z","published":"2023-01-05T05:13:48Z","title":"Learning a Generic Value-Selection Heuristic Inside a Constraint\n Programming Solver","summary":" Constraint programming is known for being an efficient approach for solving\ncombinatorial problems. Important design choices in a solver are the branching\nheuristics, which are designed to lead the search to the best solutions in a\nminimum amount of time. However, developing these heuristics is a\ntime-consuming process that requires problem-specific expertise. This\nobservation has motivated many efforts to use machine learning to automatically\nlearn efficient heuristics without expert intervention. To the best of our\nknowledge, it is still an open research question. Although several generic\nvariable-selection heuristics are available in the literature, the options for\na generic value-selection heuristic are more scarce. In this paper, we propose\nto tackle this issue by introducing a generic learning procedure that can be\nused to obtain a value-selection heuristic inside a constraint programming\nsolver. This has been achieved thanks to the combination of a deep Q-learning\nalgorithm, a tailored reward signal, and a heterogeneous graph neural network\narchitecture. Experiments on graph coloring, maximum independent set, and\nmaximum cut problems show that our framework is able to find better solutions\nclose to optimality without requiring a large amounts of backtracks while being\ngeneric.\n","authors":["Tom Marty","Tristan François","Pierre Tessier","Louis Gauthier","Louis-Martin Rousseau","Quentin Cappart"],"pdf_url":"https://arxiv.org/pdf/2301.01913v3.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2305.15016v2","updated":"2023-10-02T16:54:34Z","published":"2023-05-24T10:58:09Z","title":"An Unsupervised Method for Estimating Class Separability of Datasets\n with Application to LLMs Fine-Tuning","summary":" This paper proposes an unsupervised method that leverages topological\ncharacteristics of data manifolds to estimate class separability of the data\nwithout requiring labels. Experiments conducted in this paper on several\ndatasets demonstrate a clear correlation and consistency between the class\nseparability estimated by the proposed method with supervised metrics like\nFisher Discriminant Ratio~(FDR) and cross-validation of a classifier, which\nboth require labels. This can enable implementing learning paradigms aimed at\nlearning from both labeled and unlabeled data, like semi-supervised and\ntransductive learning. This would be particularly useful when we have limited\nlabeled data and a relatively large unlabeled dataset that can be used to\nenhance the learning process. The proposed method is implemented for language\nmodel fine-tuning with automated stopping criterion by monitoring class\nseparability of the embedding-space manifold in an unsupervised setting. 
The\nproposed methodology has been first validated on synthetic data, where the\nresults show a clear consistency between class separability estimated by the\nproposed method and class separability computed by FDR. The method has been\nalso implemented on both public and internal data. The results show that the\nproposed method can effectively aid -- without the need for labels -- a\ndecision on when to stop or continue the fine-tuning of a language model and\nwhich fine-tuning iteration is expected to achieve a maximum classification\nperformance through quantification of the class separability of the embedding\nmanifold.\n","authors":["Najah Ghalyan","Kostis Gourgoulias","Yash Satsangi","Sean Moran","Maxime Labonne","Joseph Sabelja"],"pdf_url":"https://arxiv.org/pdf/2305.15016v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.09718v2","updated":"2023-10-02T16:50:53Z","published":"2023-04-19T15:05:19Z","title":"Sample-efficient Model-based Reinforcement Learning for Quantum Control","summary":" We propose a model-based reinforcement learning (RL) approach for noisy\ntime-dependent gate optimization with improved sample complexity over\nmodel-free RL. Sample complexity is the number of controller interactions with\nthe physical system. Leveraging an inductive bias, inspired by recent advances\nin neural ordinary differential equations (ODEs), we use an auto-differentiable\nODE parametrised by a learnable Hamiltonian ansatz to represent the model\napproximating the environment whose time-dependent part, including the control,\nis fully known. Control alongside Hamiltonian learning of continuous\ntime-independent parameters is addressed through interactions with the system.\nWe demonstrate an order of magnitude advantage in the sample complexity of our\nmethod over standard model-free RL in preparing some standard unitary gates\nwith closed and open system dynamics, in realistic numerical experiments\nincorporating single shot measurements, arbitrary Hilbert space truncations and\nuncertainty in Hamiltonian parameters. Also, the learned Hamiltonian can be\nleveraged by existing control methods like GRAPE for further gradient-based\noptimization with the controllers found by RL as initializations. Our algorithm\nthat we apply on nitrogen vacancy (NV) centers and transmons in this paper is\nwell suited for controlling partially characterised one and two qubit systems.\n","authors":["Irtaza Khalid","Carrie A. Weidner","Edmond A. Jonckheere","Sophie G. Shermer","Frank C. Langbein"],"pdf_url":"https://arxiv.org/pdf/2304.09718v2.pdf","comment":"14+10 pages, 6+6 figures, revised version"},{"id":"http://arxiv.org/abs/2306.10060v2","updated":"2023-10-02T16:44:32Z","published":"2023-06-14T13:06:04Z","title":"MUBen: Benchmarking the Uncertainty of Molecular Representation Models","summary":" Large molecular representation models pre-trained on massive unlabeled data\nhave shown great success in predicting molecular properties. However, these\nmodels may tend to overfit the fine-tuning data, resulting in over-confident\npredictions on test data that fall outside of the training distribution. To\naddress this issue, uncertainty quantification (UQ) methods can be used to\nimprove the models' calibration of predictions. Although many UQ approaches\nexist, not all of them lead to improved performance. 
While some studies have\nincluded UQ to improve molecular pre-trained models, the process of selecting\nsuitable backbone and UQ methods for reliable molecular uncertainty estimation\nremains underexplored. To address this gap, we present MUBen, which evaluates\ndifferent UQ methods for state-of-the-art backbone molecular representation\nmodels to investigate their capabilities. By fine-tuning various backbones\nusing different molecular descriptors as inputs with UQ methods from different\ncategories, we critically assess the influence of architectural decisions and\ntraining strategies. Our study offers insights for selecting UQ for backbone\nmodels, which can facilitate research on uncertainty-critical applications in\nfields such as materials science and drug discovery.\n","authors":["Yinghao Li","Lingkai Kong","Yuanqi Du","Yue Yu","Yuchen Zhuang","Wenhao Mu","Chao Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.10060v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14945v3","updated":"2023-10-02T16:08:29Z","published":"2023-08-28T23:51:33Z","title":"Noise-Free Sampling Algorithms via Regularized Wasserstein Proximals","summary":" We consider the problem of sampling from a distribution governed by a\npotential function. This work proposes an explicit score based MCMC method that\nis deterministic, resulting in a deterministic evolution for particles rather\nthan a stochastic differential equation evolution. The score term is given in\nclosed form by a regularized Wasserstein proximal, using a kernel convolution\nthat is approximated by sampling. We demonstrate fast convergence on various\nproblems and show improved dimensional dependence of mixing time bounds for the\ncase of Gaussian distributions compared to the unadjusted Langevin algorithm\n(ULA) and the Metropolis-adjusted Langevin algorithm (MALA). We additionally\nderive closed form expressions for the distributions at each iterate for\nquadratic potential functions, characterizing the variance reduction. Empirical\nresults demonstrate that the particles behave in an organized manner, lying on\nlevel set contours of the potential. Moreover, the posterior mean estimator of\nthe proposed method is shown to be closer to the maximum a-posteriori estimator\ncompared to ULA and MALA in the context of Bayesian logistic regression.\nAdditional examples demonstrate competitive performance for Bayesian neural\nnetwork training.\n","authors":["Hong Ye Tan","Stanley Osher","Wuchen Li"],"pdf_url":"https://arxiv.org/pdf/2308.14945v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04536v2","updated":"2023-10-02T15:56:21Z","published":"2023-07-10T13:01:27Z","title":"DADO -- Low-Cost Query Strategies for Deep Active Design Optimization","summary":" In this experience report, we apply deep active learning to the field of\ndesign optimization to reduce the number of computationally expensive numerical\nsimulations. We are interested in optimizing the design of structural\ncomponents, where the shape is described by a set of parameters. If we can\npredict the performance based on these parameters and consider only the\npromising candidates for simulation, there is an enormous potential for saving\ncomputing power. We present two selection strategies for self-optimization to\nreduce the computational cost in multi-objective design optimization problems.\nOur proposed methodology provides an intuitive approach that is easy to apply,\noffers significant improvements over random sampling, and circumvents the need\nfor uncertainty estimation. 
We evaluate our strategies on a large dataset from\nthe domain of fluid dynamics and introduce two new evaluation metrics to\ndetermine the model's performance. Findings from our evaluation highlights the\neffectiveness of our selection strategies in accelerating design optimization.\nWe believe that the introduced method is easily transferable to other\nself-optimization problems.\n","authors":["Jens Decke","Christian Gruhl","Lukas Rauch","Bernhard Sick"],"pdf_url":"https://arxiv.org/pdf/2307.04536v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.08374v2","updated":"2023-10-02T15:55:19Z","published":"2023-09-15T13:04:11Z","title":"Understanding the limitations of self-supervised learning for tabular\n anomaly detection","summary":" While self-supervised learning has improved anomaly detection in computer\nvision and natural language processing, it is unclear whether tabular data can\nbenefit from it. This paper explores the limitations of self-supervision for\ntabular anomaly detection. We conduct several experiments spanning various\npretext tasks on 26 benchmark datasets to understand why this is the case. Our\nresults confirm representations derived from self-supervision do not improve\ntabular anomaly detection performance compared to using the raw representations\nof the data. We show this is due to neural networks introducing irrelevant\nfeatures, which reduces the effectiveness of anomaly detectors. However, we\ndemonstrate that using a subspace of the neural network's representation can\nrecover performance.\n","authors":["Kimberly T. Mai","Toby Davies","Lewis D. Griffin"],"pdf_url":"https://arxiv.org/pdf/2309.08374v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13498v2","updated":"2023-10-02T15:46:18Z","published":"2023-08-25T17:13:42Z","title":"Escaping the Sample Trap: Fast and Accurate Epistemic Uncertainty\n Estimation with Pairwise-Distance Estimators","summary":" In machine learning, the ability to assess uncertainty in model predictions\nis crucial for decision-making, safety-critical applications, and model\ngeneralizability. This work introduces a novel approach for epistemic\nuncertainty estimation for ensemble models using pairwise-distance estimators\n(PaiDEs). These estimators utilize the pairwise-distance between model\ncomponents to establish bounds on entropy, which are then used as estimates for\ninformation-based criterion. Unlike recent deep learning methods for epistemic\nuncertainty estimation, which rely on sample-based Monte Carlo estimators,\nPaiDEs are able to estimate epistemic uncertainty up to 100 times faster, over\na larger input space (up to 100 times) and perform more accurately in higher\ndimensions. To validate our approach, we conducted a series of experiments\ncommonly used to evaluate epistemic uncertainty estimation: 1D sinusoidal data,\n$\\textit{Pendulum-v0}$, $\\textit{Hopper-v2}$, $\\textit{Ant-v2}$ and\n$\\textit{Humanoid-v2}$. 
For each experimental setting, an Active Learning\nframework was applied to demonstrate the advantages of PaiDEs for epistemic\nuncertainty estimation.\n","authors":["Lucas Berry","David Meger"],"pdf_url":"https://arxiv.org/pdf/2308.13498v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07936v2","updated":"2023-10-02T15:37:23Z","published":"2023-09-14T01:53:45Z","title":"Landscape-Sketch-Step: An AI/ML-Based Metaheuristic for Surrogate\n Optimization Problems","summary":" In this paper, we introduce a new heuristics for global optimization in\nscenarios where extensive evaluations of the cost function are expensive,\ninaccessible, or even prohibitive. The method, which we call\nLandscape-Sketch-and-Step (LSS), combines Machine Learning, Stochastic\nOptimization, and Reinforcement Learning techniques, relying on historical\ninformation from previously sampled points to make judicious choices of\nparameter values where the cost function should be evaluated at. Unlike\noptimization by Replica Exchange Monte Carlo methods, the number of evaluations\nof the cost function required in this approach is comparable to that used by\nSimulated Annealing, quality that is especially important in contexts like\nhigh-throughput computing or high-performance computing tasks, where\nevaluations are either computationally expensive or take a long time to be\nperformed. The method also differs from standard Surrogate Optimization\ntechniques, for it does not construct a surrogate model that aims at\napproximating or reconstructing the objective function. We illustrate our\nmethod by applying it to low dimensional optimization problems (dimensions 1,\n2, 4, and 8) that mimick known difficulties of minimization on rugged energy\nlandscapes often seen in Condensed Matter Physics, where cost functions are\nrugged and plagued with local minima. When compared to classical Simulated\nAnnealing, the LSS shows an effective acceleration of the optimization process.\n","authors":["Rafael Monteiro","Kartik Sau"],"pdf_url":"https://arxiv.org/pdf/2309.07936v2.pdf","comment":"Git-hub on\n https://github.com/rafael-a-monteiro-math/landscape_sketch_and_step/"},{"id":"http://arxiv.org/abs/2303.15564v2","updated":"2023-10-02T15:33:54Z","published":"2023-03-27T19:23:33Z","title":"Mask and Restore: Blind Backdoor Defense at Test Time with Masked\n Autoencoder","summary":" Deep neural networks are vulnerable to backdoor attacks, where an adversary\nmaliciously manipulates the model behavior through overlaying images with\nspecial triggers. Existing backdoor defense methods often require accessing a\nfew validation data and model parameters, which are impractical in many\nreal-world applications, e.g., when the model is provided as a cloud service.\nIn this paper, we address the practical task of blind backdoor defense at test\ntime, in particular for black-box models. The true label of every test image\nneeds to be recovered on the fly from a suspicious model regardless of image\nbenignity. We focus on test-time image purification methods that incapacitate\npossible triggers while keeping semantic contents intact. Due to diverse\ntrigger patterns and sizes, the heuristic trigger search in image space can be\nunscalable. We circumvent such barrier by leveraging the strong reconstruction\npower of generative models, and propose a framework of Blind Defense with\nMasked AutoEncoder (BDMAE). 
It detects possible triggers in the token space\nusing image structural similarity and label consistency between the test image\nand MAE restorations. The detection results are then refined by considering\ntrigger topology. Finally, we fuse MAE restorations adaptively into a purified\nimage for making prediction. Our approach is blind to the model architectures,\ntrigger patterns and image benignity. Extensive experiments under different\nbackdoor settings validate its effectiveness and generalizability. Code is\navailable at https://github.com/tsun/BDMAE.\n","authors":["Tao Sun","Lu Pang","Chao Chen","Haibin Ling"],"pdf_url":"https://arxiv.org/pdf/2303.15564v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08018v3","updated":"2023-10-02T15:27:20Z","published":"2023-06-13T14:35:34Z","title":"Mol-Instructions: A Large-Scale Biomolecular Instruction Dataset for\n Large Language Models","summary":" Large Language Models (LLMs), with their remarkable task-handling\ncapabilities and innovative outputs, have catalyzed significant advancements\nacross a spectrum of fields. However, their proficiency within specialized\ndomains such as biomolecular studies remains limited. To address this\nchallenge, we introduce Mol-Instructions, a comprehensive instruction dataset\ndesigned for the biomolecular domain. Mol-Instructions encompasses three key\ncomponents: molecule-oriented instructions, protein-oriented instructions, and\nbiomolecular text instructions. Each component aims to improve the\nunderstanding and prediction capabilities of LLMs concerning biomolecular\nfeatures and behaviors. Through extensive instruction tuning experiments on\nLLMs, we demonstrate the effectiveness of Mol-Instructions in enhancing large\nmodels' performance in the intricate realm of biomolecular studies, thus\nfostering progress in the biomolecular research community. Mol-Instructions is\npublicly available for ongoing research and will undergo regular updates to\nenhance its applicability.\n","authors":["Yin Fang","Xiaozhuan Liang","Ningyu Zhang","Kangwei Liu","Rui Huang","Zhuo Chen","Xiaohui Fan","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2306.08018v3.pdf","comment":"Project homepage: https://github.com/zjunlp/Mol-Instructions"},{"id":"http://arxiv.org/abs/2301.11259v5","updated":"2023-10-02T15:17:34Z","published":"2023-01-26T17:52:56Z","title":"Domain-Agnostic Molecular Generation with Self-feedback","summary":" The generation of molecules with desired properties has gained tremendous\npopularity, revolutionizing the way scientists design molecular structures and\nproviding valuable support for chemical and drug design. However, despite the\npotential of language models in molecule generation, they face numerous\nchallenges such as the generation of syntactically or chemically flawed\nmolecules, narrow domain focus, and limitations in creating diverse and\ndirectionally feasible molecules due to a dearth of annotated data or external\nmolecular databases. To tackle these challenges, we introduce MolGen, a\npre-trained molecular language model tailored specifically for molecule\ngeneration. Through the reconstruction of over 100 million molecular SELFIES,\nMolGen internalizes profound structural and grammatical insights. This is\nfurther enhanced by domain-agnostic molecular prefix tuning, fostering robust\nknowledge transfer across diverse domains. 
Importantly, our self-feedback\nparadigm steers the model away from ``molecular hallucinations'', ensuring\nalignment between the model's estimated probabilities and real-world chemical\npreferences. Extensive experiments on well-known benchmarks underscore MolGen's\noptimization capabilities in properties such as penalized logP, QED, and\nmolecular docking. Additional analyses affirm its proficiency in accurately\ncapturing molecule distributions, discerning intricate structural patterns, and\nefficiently exploring the chemical space. Code is available at\nhttps://github.com/zjunlp/MolGen.\n","authors":["Yin Fang","Ningyu Zhang","Zhuo Chen","Lingbing Guo","Xiaohui Fan","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2301.11259v5.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2305.18270v2","updated":"2023-10-02T14:51:39Z","published":"2023-05-29T17:43:44Z","title":"How Two-Layer Neural Networks Learn, One (Giant) Step at a Time","summary":" We investigate theoretically how the features of a two-layer neural network\nadapt to the structure of the target function through a few large batch\ngradient descent steps, leading to improvement in the approximation capacity\nwith respect to the initialization. We compare the influence of batch size and\nthat of multiple (but finitely many) steps. For a single gradient step, a batch\nof size $n = \\mathcal{O}(d)$ is both necessary and sufficient to align with the\ntarget function, although only a single direction can be learned. In contrast,\n$n = \\mathcal{O}(d^2)$ is essential for neurons to specialize to multiple\nrelevant directions of the target with a single gradient step. Even in this\ncase, we show there might exist ``hard'' directions requiring $n =\n\\mathcal{O}(d^\\ell)$ samples to be learned, where $\\ell$ is known as the leap\nindex of the target. The picture drastically improves over multiple gradient\nsteps: we show that a batch-size of $n = \\mathcal{O}(d)$ is indeed enough to\nlearn multiple target directions satisfying a staircase property, where more\nand more directions can be learned over time. Finally, we discuss how these\ndirections allows to drastically improve the approximation capacity and\ngeneralization error over the initialization, illustrating a separation of\nscale between the random features/lazy regime, and the feature learning regime.\nOur technical analysis leverages a combination of techniques related to\nconcentration, projection-based conditioning, and Gaussian equivalence which we\nbelieve are of independent interest. By pinning down the conditions necessary\nfor specialization and learning, our results highlight the interaction between\nbatch size and number of iterations, and lead to a hierarchical depiction where\nlearning performance exhibits a stairway to accuracy over time and batch size,\nshedding new light on how neural networks adapt to features of the data.\n","authors":["Yatin Dandi","Florent Krzakala","Bruno Loureiro","Luca Pesce","Ludovic Stephan"],"pdf_url":"https://arxiv.org/pdf/2305.18270v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.15498v3","updated":"2023-10-02T14:44:05Z","published":"2022-11-28T16:17:47Z","title":"Physics-informed neural networks with unknown measurement noise","summary":" Physics-informed neural networks (PINNs) constitute a flexible approach to\nboth finding solutions and identifying parameters of partial differential\nequations. Most works on the topic assume noiseless data, or data contaminated\nby weak Gaussian noise. 
We show that the standard PINN framework breaks down in\ncase of non-Gaussian noise. We give a way of resolving this fundamental issue\nand we propose to jointly train an energy-based model (EBM) to learn the\ncorrect noise distribution. We illustrate the improved performance of our\napproach using multiple examples.\n","authors":["Philipp Pilar","Niklas Wahlström"],"pdf_url":"https://arxiv.org/pdf/2211.15498v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.17425v2","updated":"2023-10-02T14:40:05Z","published":"2023-09-29T17:37:29Z","title":"Data Filtering Networks","summary":" Large training sets have become a cornerstone of machine learning and are the\nfoundation for recent advances in language modeling and multimodal learning.\nWhile data curation for pre-training is often still ad-hoc, one common paradigm\nis to first collect a massive pool of data from the Web and then filter this\ncandidate pool down to an actual training set via various heuristics. In this\nwork, we study the problem of learning a data filtering network (DFN) for this\nsecond step of filtering a large uncurated dataset. Our key finding is that the\nquality of a network for filtering is distinct from its performance on\ndownstream tasks: for instance, a model that performs well on ImageNet can\nyield worse training sets than a model with low ImageNet accuracy that is\ntrained on a small amount of high-quality data. Based on our insights, we\nconstruct new data filtering networks that induce state-of-the-art image-text\ndatasets. Specifically, our best performing dataset DFN-5B enables us to train\nstate-of-the-art models for their compute budgets: among other improvements on\na variety of tasks, a ViT-H trained on our dataset achieves 83.0% zero-shot\ntransfer accuracy on ImageNet, out-performing models trained on other datasets\nsuch as LAION-2B, DataComp-1B, or OpenAI's WIT. In order to facilitate further\nresearch in dataset design, we also release a new 2 billion example dataset\nDFN-2B and show that high performance data filtering networks can be trained\nfrom scratch using only publicly available data.\n","authors":["Alex Fang","Albin Madappally Jose","Amit Jain","Ludwig Schmidt","Alexander Toshev","Vaishaal Shankar"],"pdf_url":"https://arxiv.org/pdf/2309.17425v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10080v2","updated":"2023-10-02T14:39:23Z","published":"2023-06-16T06:41:04Z","title":"AI Driven Near Real-time Locational Marginal Pricing Method: A\n Feasibility and Robustness Study","summary":" Accurate price predictions are essential for market participants in order to\noptimize their operational schedules and bidding strategies, especially in the\ncurrent context where electricity prices become more volatile and less\npredictable using classical approaches. The Locational Marginal Pricing (LMP)\npricing mechanism is used in many modern power markets, where the traditional\napproach utilizes optimal power flow (OPF) solvers. However, for large\nelectricity grids this process becomes prohibitively time-consuming and\ncomputationally intensive. Machine learning (ML) based predictions could\nprovide an efficient tool for LMP prediction, especially in energy markets with\nintermittent sources like renewable energy. This study evaluates the\nperformance of popular machine learning and deep learning models in predicting\nLMP on multiple electricity grids. The accuracy and robustness of these models\nin predicting LMP is assessed considering multiple scenarios. 
The results show\nthat ML models can predict LMP 4-5 orders of magnitude faster than traditional\nOPF solvers with 5-6\\% error rate, highlighting the potential of ML models in\nLMP prediction for large-scale power models with the assistance of hardware\ninfrastructure like multi-core CPUs and GPUs in modern HPC clusters.\n","authors":["Naga Venkata Sai Jitin Jami","Juraj Kardoš","Olaf Schenk","Harald Köstler"],"pdf_url":"https://arxiv.org/pdf/2306.10080v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14062v2","updated":"2023-10-02T14:37:44Z","published":"2023-05-23T13:41:52Z","title":"Amplitude-Independent Machine Learning for PPG through Visibility Graphs\n and Transfer Learning","summary":" Photoplethysmography (PPG) refers to the measurement of variations in blood\nvolume using light and is a feature of most wearable devices. The PPG signals\nprovide insight into the body's circulatory system and can be employed to\nextract various bio-features, such as heart rate and vascular ageing. Although\nseveral algorithms have been proposed for this purpose, many exhibit\nlimitations, including heavy reliance on human calibration, high signal quality\nrequirements, and a lack of generalisation. In this paper, we introduce a PPG\nsignal processing framework that integrates graph theory and computer vision\nalgorithms, to provide an analysis framework which is amplitude-independent and\ninvariant to affine transformations. It also requires minimal preprocessing,\nfuses information through RGB channels and exhibits robust generalisation\nacross tasks and datasets. The proposed VGTL-net achieves state-of-the-art\nperformance in the prediction of vascular ageing and demonstrates robust\nestimation of continuous blood pressure waveforms.\n","authors":["Yuyang Miao","Harry J. Davies","Danilo P. Mandic"],"pdf_url":"https://arxiv.org/pdf/2305.14062v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.11160v3","updated":"2023-10-02T14:33:05Z","published":"2023-04-21T17:59:08Z","title":"The Isotonic Mechanism for Exponential Family Estimation","summary":" In 2023, the International Conference on Machine Learning (ICML) required\nauthors with multiple submissions to rank their submissions based on perceived\nquality. In this paper, we aim to employ these author-specified rankings to\nenhance peer review in machine learning and artificial intelligence conferences\nby extending the Isotonic Mechanism to exponential family distributions. This\nmechanism generates adjusted scores that closely align with the original scores\nwhile adhering to author-specified rankings. Despite its applicability to a\nbroad spectrum of exponential family distributions, implementing this mechanism\ndoes not require knowledge of the specific distribution form. We demonstrate\nthat an author is incentivized to provide accurate rankings when her utility\ntakes the form of a convex additive function of the adjusted review scores. For\na certain subclass of exponential family distributions, we prove that the\nauthor reports truthfully only if the question involves only pairwise\ncomparisons between her submissions, thus indicating the optimality of ranking\nin truthful information elicitation. Moreover, we show that the adjusted scores\nimprove dramatically the estimation accuracy compared to the original scores\nand achieve nearly minimax optimality when the ground-truth scores have bounded\ntotal variation. 
We conclude the paper by presenting experiments conducted on\nthe ICML 2023 ranking data, which show significant estimation gain using the\nIsotonic Mechanism.\n","authors":["Yuling Yan","Weijie J. Su","Jianqing Fan"],"pdf_url":"https://arxiv.org/pdf/2304.11160v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.14838v2","updated":"2023-10-02T14:26:42Z","published":"2023-02-28T18:37:25Z","title":"EvoPrompting: Language Models for Code-Level Neural Architecture Search","summary":" Given the recent impressive accomplishments of language models (LMs) for code\ngeneration, we explore the use of LMs as adaptive mutation and crossover\noperators for an evolutionary neural architecture search (NAS) algorithm. While\nNAS still proves too difficult a task for LMs to succeed at solely through\nprompting, we find that the combination of evolutionary prompt engineering with\nsoft prompt-tuning, a method we term EvoPrompting, consistently finds diverse\nand high performing models. We first demonstrate that EvoPrompting is effective\non the computationally efficient MNIST-1D dataset, where EvoPrompting produces\nconvolutional architecture variants that outperform both those designed by\nhuman experts and naive few-shot prompting in terms of accuracy and model size.\nWe then apply our method to searching for graph neural networks on the CLRS\nAlgorithmic Reasoning Benchmark, where EvoPrompting is able to design novel\narchitectures that outperform current state-of-the-art models on 21 out of 30\nalgorithmic reasoning tasks while maintaining similar model size. EvoPrompting\nis successful at designing accurate and efficient neural network architectures\nacross a variety of machine learning tasks, while also being general enough for\neasy adaptation to other tasks beyond neural network design.\n","authors":["Angelica Chen","David M. Dohan","David R. So"],"pdf_url":"https://arxiv.org/pdf/2302.14838v2.pdf","comment":"To be presented at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2309.17053v2","updated":"2023-10-02T14:23:09Z","published":"2023-09-29T08:26:44Z","title":"On the Power of the Weisfeiler-Leman Test for Graph Motif Parameters","summary":" Seminal research in the field of graph neural networks (GNNs) has revealed a\ndirect correspondence between the expressive capabilities of GNNs and the\n$k$-dimensional Weisfeiler-Leman ($k$WL) test, a widely-recognized method for\nverifying graph isomorphism. This connection has reignited interest in\ncomprehending the specific graph properties effectively distinguishable by the\n$k$WL test. A central focus of research in this field revolves around\ndetermining the least dimensionality $k$, for which $k$WL can discern graphs\nwith different number of occurrences of a pattern graph $P$. We refer to such a\nleast $k$ as the WL-dimension of this pattern counting problem. This inquiry\ntraditionally delves into two distinct counting problems related to patterns:\nsubgraph counting and induced subgraph counting. Intriguingly, despite their\ninitial appearance as separate challenges with seemingly divergent approaches,\nboth of these problems are interconnected components of a more comprehensive\nproblem: \"graph motif parameters\". In this paper, we provide a precise\ncharacterization of the WL-dimension of labeled graph motif parameters. As\nspecific instances of this result, we obtain characterizations of the\nWL-dimension of the subgraph counting and induced subgraph counting problem for\nevery labeled pattern $P$. 
We additionally demonstrate that in cases where the\n$k$WL test distinguishes between graphs with varying occurrences of a pattern\n$P$, the exact number of occurrences of $P$ can be computed uniformly using\nonly local information of the last layer of a corresponding GNN. We finally\ndelve into the challenge of recognizing the WL-dimension of various graph\nparameters. We give a polynomial time algorithm for determining the\nWL-dimension of the subgraph counting problem for given pattern $P$, answering\nan open question from previous work.\n","authors":["Matthias Lanzinger","Pablo Barceló"],"pdf_url":"https://arxiv.org/pdf/2309.17053v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.15111v2","updated":"2023-10-02T14:21:45Z","published":"2023-09-26T17:57:44Z","title":"SGD Finds then Tunes Features in Two-Layer Neural Networks with\n near-Optimal Sample Complexity: A Case Study in the XOR problem","summary":" In this work, we consider the optimization process of minibatch stochastic\ngradient descent (SGD) on a 2-layer neural network with data separated by a\nquadratic ground truth function. We prove that with data drawn from the\n$d$-dimensional Boolean hypercube labeled by the quadratic ``XOR'' function $y\n= -x_ix_j$, it is possible to train to a population error $o(1)$ with $d\n\\:\\text{polylog}(d)$ samples. Our result considers simultaneously training both\nlayers of the two-layer-neural network with ReLU activations via standard\nminibatch SGD on the logistic loss. To our knowledge, this work is the first to\ngive a sample complexity of $\\tilde{O}(d)$ for efficiently learning the XOR\nfunction on isotropic data on a standard neural network with standard training.\nOur main technique is showing that the network evolves in two phases: a\n$\\textit{signal-finding}$ phase where the network is small and many of the\nneurons evolve independently to find features, and a $\\textit{signal-heavy}$\nphase, where SGD maintains and balances the features. We leverage the\nsimultaneous training of the layers to show that it is sufficient for only a\nsmall fraction of the neurons to learn features, since those neurons will be\namplified by the simultaneous growth of their second layer weights.\n","authors":["Margalit Glasgow"],"pdf_url":"https://arxiv.org/pdf/2309.15111v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.13837v2","updated":"2023-10-02T13:55:46Z","published":"2023-09-25T02:50:20Z","title":"Backorder Prediction in Inventory Management: Classification Techniques\n and Cost Considerations","summary":" This article introduces an advanced analytical approach for predicting\nbackorders in inventory management. Backorder refers to an order that cannot be\nimmediately fulfilled due to stock depletion. Multiple classification\ntechniques, including Balanced Bagging Classifiers, Fuzzy Logic, Variational\nAutoencoder - Generative Adversarial Networks, and Multi-layer Perceptron\nclassifiers, are assessed in this work using performance evaluation metrics\nsuch as ROC-AUC and PR-AUC. Moreover, this work incorporates a profit function\nand misclassification costs, considering the financial implications and costs\nassociated with inventory management and backorder handling. The results\ndemonstrate the effectiveness of the predictive model in enhancing inventory\nsystem service levels, which leads to customer satisfaction and overall\norganizational performance. 
Considering interpretability is a significant\naspect of using AI in commercial applications, permutation importance is\napplied to the selected model to determine the importance of features. This\nresearch contributes to the advancement of predictive analytics and offers\nvaluable insights for future investigations in backorder forecasting and\ninventory control optimization for decision-making.\n","authors":["Sarit Maitra","Sukanya Kundu"],"pdf_url":"https://arxiv.org/pdf/2309.13837v2.pdf","comment":"8 pages, 4 figures, IEEE (ICSEC 2023)"},{"id":"http://arxiv.org/abs/2304.06094v3","updated":"2023-10-02T13:34:21Z","published":"2023-04-12T18:20:58Z","title":"Energy-guided Entropic Neural Optimal Transport","summary":" Energy-based models (EBMs) are known in the Machine Learning community for\ndecades. Since the seminal works devoted to EBMs dating back to the noughties,\nthere have been a lot of efficient methods which solve the generative modelling\nproblem by means of energy potentials (unnormalized likelihood functions). In\ncontrast, the realm of Optimal Transport (OT) and, in particular, neural OT\nsolvers is much less explored and limited by few recent works (excluding\nWGAN-based approaches which utilize OT as a loss function and do not model OT\nmaps themselves). In our work, we bridge the gap between EBMs and\nEntropy-regularized OT. We present a novel methodology which allows utilizing\nthe recent developments and technical improvements of the former in order to\nenrich the latter. From the theoretical perspective, we prove generalization\nbounds for our technique. In practice, we validate its applicability in toy 2D\nand image domains. To showcase the scalability, we empower our method with a\npre-trained StyleGAN and apply it to high-res AFHQ $512\\times 512$ unpaired I2I\ntranslation. For simplicity, we choose simple short- and long-run EBMs as a\nbackbone of our Energy-guided Entropic OT approach, leaving the application of\nmore sophisticated EBMs for future research. Our code is publicly available.\n","authors":["Petr Mokrov","Alexander Korotin","Alexander Kolesov","Nikita Gushchin","Evgeny Burnaev"],"pdf_url":"https://arxiv.org/pdf/2304.06094v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.12317v2","updated":"2023-10-02T13:07:37Z","published":"2023-04-24T17:59:52Z","title":"Total-Recon: Deformable Scene Reconstruction for Embodied View Synthesis","summary":" We explore the task of embodied view synthesis from monocular videos of\ndeformable scenes. Given a minute-long RGBD video of people interacting with\ntheir pets, we render the scene from novel camera trajectories derived from the\nin-scene motion of actors: (1) egocentric cameras that simulate the point of\nview of a target actor and (2) 3rd-person cameras that follow the actor.\nBuilding such a system requires reconstructing the root-body and articulated\nmotion of every actor, as well as a scene representation that supports\nfree-viewpoint synthesis. Longer videos are more likely to capture the scene\nfrom diverse viewpoints (which helps reconstruction) but are also more likely\nto contain larger motions (which complicates reconstruction). To address these\nchallenges, we present Total-Recon, the first method to photorealistically\nreconstruct deformable scenes from long monocular RGBD videos. Crucially, to\nscale to long videos, our method hierarchically decomposes the scene into the\nbackground and objects, whose motion is decomposed into carefully initialized\nroot-body motion and local articulations. 
To quantify such \"in-the-wild\"\nreconstruction and view synthesis, we collect ground-truth data from a\nspecialized stereo RGBD capture rig for 11 challenging videos, significantly\noutperforming prior methods. Our code, model, and data can be found at\nhttps://andrewsonga.github.io/totalrecon .\n","authors":["Chonghyuk Song","Gengshan Yang","Kangle Deng","Jun-Yan Zhu","Deva Ramanan"],"pdf_url":"https://arxiv.org/pdf/2304.12317v2.pdf","comment":"ICCV 2023 camera-ready version. Project page with code, models, and\n data: https://andrewsonga.github.io/totalrecon"},{"id":"http://arxiv.org/abs/2305.16183v2","updated":"2023-10-02T13:04:59Z","published":"2023-05-25T15:39:46Z","title":"Passive learning of active causal strategies in agents and language\n models","summary":" What can be learned about causality and experimentation from passive data?\nThis question is salient given recent successes of passively-trained language\nmodels in interactive domains such as tool use. Passive learning is inherently\nlimited. However, we show that purely passive learning can in fact allow an\nagent to learn generalizable strategies for determining and using causal\nstructures, as long as the agent can intervene at test time. We formally\nillustrate that learning a strategy of first experimenting, then seeking goals,\ncan allow generalization from passive learning in principle. We then show\nempirically that agents trained via imitation on expert data can indeed\ngeneralize at test time to infer and use causal links which are never present\nin the training data; these agents can also generalize experimentation\nstrategies to novel variable sets never observed in training. We then show that\nstrategies for causal intervention and exploitation can be generalized from\npassive data even in a more complex environment with high-dimensional\nobservations, with the support of natural language explanations. Explanations\ncan even allow passive learners to generalize out-of-distribution from\nperfectly-confounded training data. Finally, we show that language models,\ntrained only on passive next-word prediction, can generalize causal\nintervention strategies from a few-shot prompt containing examples of\nexperimentation, together with explanations and reasoning. These results\nhighlight the surprising power of passive learning of active causal strategies,\nand may help to understand the behaviors and capabilities of language models.\n","authors":["Andrew Kyle Lampinen","Stephanie C Y Chan","Ishita Dasgupta","Andrew J Nam","Jane X Wang"],"pdf_url":"https://arxiv.org/pdf/2305.16183v2.pdf","comment":"Advances in Neural Information Processing Systems (NeurIPS 2023). 10\n pages main text"},{"id":"http://arxiv.org/abs/2307.10922v2","updated":"2023-10-02T12:57:16Z","published":"2023-07-20T14:47:50Z","title":"Language-based Action Concept Spaces Improve Video Self-Supervised\n Learning","summary":" Recent contrastive language image pre-training has led to learning highly\ntransferable and robust image representations. However, adapting these models\nto video domains with minimal supervision remains an open problem. We explore a\nsimple step in that direction, using language tied self-supervised learning to\nadapt an image CLIP model to the video domain. A backbone modified for temporal\nmodeling is trained under self-distillation settings with train objectives\noperating in an action concept space. 
Feature vectors of various action\nconcepts extracted from a language encoder using relevant textual prompts\nconstruct this space. We introduce two train objectives, concept distillation\nand concept alignment, that retain generality of original representations while\nenforcing relations between actions and their attributes. Our approach improves\nzero-shot and linear probing performance on three action recognition\nbenchmarks.\n","authors":["Kanchana Ranasinghe","Michael Ryoo"],"pdf_url":"https://arxiv.org/pdf/2307.10922v2.pdf","comment":"Presented at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2306.13575v2","updated":"2023-10-02T11:40:41Z","published":"2023-06-23T15:55:44Z","title":"Scaling MLPs: A Tale of Inductive Bias","summary":" In this work we revisit the most fundamental building block in deep learning,\nthe multi-layer perceptron (MLP), and study the limits of its performance on\nvision tasks. Empirical insights into MLPs are important for multiple reasons.\n(1) Given the recent narrative \"less inductive bias is better\", popularized due\nto transformers eclipsing convolutional models, it is natural to explore the\nlimits of this hypothesis. To that end, MLPs offer an ideal test bed, as they\nlack any vision-specific inductive bias. (2) MLPs have almost exclusively been\nthe main protagonist in the deep learning theory literature due to their\nmathematical simplicity, serving as a proxy to explain empirical phenomena\nobserved for more complex architectures. Surprisingly, experimental datapoints\nfor MLPs are very difficult to find in the literature, especially when coupled\nwith large pre-training protocols. This discrepancy between practice and theory\nis worrying: Do MLPs reflect the empirical advances exhibited by practical\nmodels? Or do theorists need to rethink the role of MLPs as a proxy? We provide\ninsights into both these aspects. We show that the performance of MLPs\ndrastically improves with scale (94% on CIFAR10, 81% on CIFAR100, 58% on\nImageNet ReaL), highlighting that lack of inductive bias can indeed be\ncompensated. We observe that MLPs mimic the behaviour of their modern\ncounterparts faithfully, with some components in the learning setting however\nexhibiting stronger or unexpected behaviours. Due to their inherent\ncomputational efficiency, large pre-training experiments become more accessible\nfor academic researchers. All of our experiments were run on a single GPU.\n","authors":["Gregor Bachmann","Sotiris Anagnostidis","Thomas Hofmann"],"pdf_url":"https://arxiv.org/pdf/2306.13575v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12706v4","updated":"2023-10-02T11:04:23Z","published":"2023-03-16T09:14:48Z","title":"Multi-modal Variational Autoencoders for normative modelling across\n multiple imaging modalities","summary":" One of the challenges of studying common neurological disorders is disease\nheterogeneity including differences in causes, neuroimaging characteristics,\ncomorbidities, or genetic variation. Normative modelling has become a popular\nmethod for studying such cohorts where the 'normal' behaviour of a\nphysiological system is modelled and can be used at subject level to detect\ndeviations relating to disease pathology. For many heterogeneous diseases, we\nexpect to observe abnormalities across a range of neuroimaging and biological\nvariables. However, thus far, normative models have largely been developed for\nstudying a single imaging modality. 
We aim to develop a multi-modal normative\nmodelling framework where abnormality is aggregated across variables of\nmultiple modalities and is better able to detect deviations than uni-modal\nbaselines. We propose two multi-modal VAE normative models to detect subject\nlevel deviations across T1 and DTI data. Our proposed models were better able\nto detect diseased individuals, capture disease severity, and correlate with\npatient cognition than baseline approaches. We also propose a multivariate\nlatent deviation metric, measuring deviations from the joint latent space,\nwhich outperformed feature-based metrics.\n","authors":["Ana Lawry Aguila","James Chapman","Andre Altmann"],"pdf_url":"https://arxiv.org/pdf/2303.12706v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15546v2","updated":"2023-10-02T09:56:53Z","published":"2023-07-28T13:22:32Z","title":"On the Trade-off Between Efficiency and Precision of Neural Abstraction","summary":" Neural abstractions have been recently introduced as formal approximations of\ncomplex, nonlinear dynamical models. They comprise a neural ODE and a certified\nupper bound on the error between the abstract neural network and the concrete\ndynamical model. So far neural abstractions have exclusively been obtained as\nneural networks consisting entirely of $ReLU$ activation functions, resulting\nin neural ODE models that have piecewise affine dynamics, and which can be\nequivalently interpreted as linear hybrid automata. In this work, we observe\nthat the utility of an abstraction depends on its use: some scenarios might\nrequire coarse abstractions that are easier to analyse, whereas others might\nrequire more complex, refined abstractions. We therefore consider neural\nabstractions of alternative shapes, namely either piecewise constant or\nnonlinear non-polynomial (specifically, obtained via sigmoidal activations). We\nemploy formal inductive synthesis procedures to generate neural abstractions\nthat result in dynamical models with these semantics. Empirically, we\ndemonstrate the trade-off that these different neural abstraction templates\nhave vis-a-vis their precision and synthesis time, as well as the time required\nfor their safety verification (done via reachability computation). We improve\nexisting synthesis techniques to enable abstraction of higher-dimensional\nmodels, and additionally discuss the abstraction of complex neural ODEs to\nimprove the efficiency of reachability analysis for these models.\n","authors":["Alec Edwards","Mirco Giacobbe","Alessandro Abate"],"pdf_url":"https://arxiv.org/pdf/2307.15546v2.pdf","comment":"Appeared at QEST 2023. Added codebase link; corrected Eq. 11"},{"id":"http://arxiv.org/abs/2308.13328v2","updated":"2023-10-02T09:56:00Z","published":"2023-08-25T12:02:13Z","title":"Compressor-Based Classification for Atrial Fibrillation Detection","summary":" Atrial fibrillation (AF) is one of the most common arrhythmias with\nchallenging public health implications. Therefore, automatic detection of AF\nepisodes on ECG is one of the essential tasks in biomedical engineering. In\nthis paper, we applied the recently introduced method of compressor-based text\nclassification with gzip algorithm for AF detection (binary classification\nbetween heart rhythms). We investigated the normalized compression distance\napplied to RR-interval and $\\Delta$RR-interval sequences ($\\Delta$RR-interval\nis the difference between subsequent RR-intervals). 
Here, the configuration of\nthe k-nearest neighbour classifier, an optimal window length, and the choice of\ndata types for compression were analyzed. We achieved good classification\nresults while learning on the full MIT-BIH Atrial Fibrillation database, close\nto the best specialized AF detection algorithms (avg. sensitivity = 97.1\\%,\navg. specificity = 91.7\\%, best sensitivity of 99.8\\%, best specificity of\n97.6\\% with fivefold cross-validation). In addition, we evaluated the\nclassification performance under the few-shot learning setting. Our results\nsuggest that gzip compression-based classification, originally proposed for\ntexts, is suitable for biomedical data and quantized continuous stochastic\nsequences in general.\n","authors":["Nikita Markov","Konstantin Ushenin","Yakov Bozhko","Olga Solovyova"],"pdf_url":"https://arxiv.org/pdf/2308.13328v2.pdf","comment":"This paper is sent for review at the IEEE conference, 2023"},{"id":"http://arxiv.org/abs/2309.17446v2","updated":"2023-10-02T09:54:50Z","published":"2023-09-29T17:57:00Z","title":"L2CEval: Evaluating Language-to-Code Generation Capabilities of Large\n Language Models","summary":" Recently, large language models (LLMs), especially those that are pretrained\non code, have demonstrated strong capabilities in generating programs from\nnatural language inputs in a few-shot or even zero-shot manner. Despite\npromising results, there is a notable lack of a comprehensive evaluation of\nthese models language-to-code generation capabilities. Existing studies often\nfocus on specific tasks, model architectures, or learning paradigms, leading to\na fragmented understanding of the overall landscape. In this work, we present\nL2CEval, a systematic evaluation of the language-to-code generation\ncapabilities of LLMs on 7 tasks across the domain spectrum of semantic parsing,\nmath reasoning and Python programming, analyzing the factors that potentially\naffect their performance, such as model size, pretraining data, instruction\ntuning, and different prompting methods. In addition to assessing model\nperformance, we measure confidence calibration for the models and conduct human\nevaluations of the output programs. This enables us to identify and analyze the\ntypical failure modes across various tasks and models. L2CEval offers a\ncomprehensive understanding of the capabilities and limitations of LLMs in\nlanguage-to-code generation. We also release the evaluation framework and all\nmodel outputs, hoping to lay the groundwork for further future research in this\ndomain.\n","authors":["Ansong Ni","Pengcheng Yin","Yilun Zhao","Martin Riddell","Troy Feng","Rui Shen","Stephen Yin","Ye Liu","Semih Yavuz","Caiming Xiong","Shafiq Joty","Yingbo Zhou","Dragomir Radev","Arman Cohan"],"pdf_url":"https://arxiv.org/pdf/2309.17446v2.pdf","comment":"Project Website: https://l2c-eval.github.io/"},{"id":"http://arxiv.org/abs/2303.09874v2","updated":"2023-10-02T09:40:54Z","published":"2023-03-17T10:38:27Z","title":"Disentangling the Link Between Image Statistics and Human Perception","summary":" In the 1950s, Barlow and Attneave hypothesised a link between biological\nvision and information maximisation. Following Shannon, information was defined\nusing the probability of natural images. A number of physiological and\npsychophysical phenomena have been derived ever since from principles like\ninfo-max, efficient coding, or optimal denoising. However, it remains unclear\nhow this link is expressed in mathematical terms from image probability. 
First,\nclassical derivations were subjected to strong assumptions on the probability\nmodels and on the behaviour of the sensors. Moreover, the direct evaluation of\nthe hypothesis was limited by the inability of the classical image models to\ndeliver accurate estimates of the probability. In this work we directly\nevaluate image probabilities using an advanced generative model for natural\nimages, and we analyse how probability-related factors can be combined to\npredict human perception via sensitivity of state-of-the-art subjective image\nquality metrics. We use information theory and regression analysis to find a\ncombination of just two probability-related factors that achieves 0.8\ncorrelation with subjective metrics. This probability-based sensitivity is\npsychophysically validated by reproducing the basic trends of the Contrast\nSensitivity Function, its suprathreshold variation, and trends of the Weber-law\nand masking.\n","authors":["Alexander Hepburn","Valero Laparra","Raúl Santos-Rodriguez","Jesús Malo"],"pdf_url":"https://arxiv.org/pdf/2303.09874v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.05599v5","updated":"2023-10-02T09:26:37Z","published":"2023-01-13T15:04:32Z","title":"Short-length SSVEP data extension by a novel generative adversarial\n networks based framework","summary":" Steady-state visual evoked potentials (SSVEPs) based brain-computer interface\n(BCI) has received considerable attention due to its high information transfer\nrate (ITR) and available quantity of targets. However, the performance of\nfrequency identification methods heavily hinges on the amount of user\ncalibration data and data length, which hinders the deployment in real-world\napplications. Recently, generative adversarial networks (GANs)-based data\ngeneration methods have been widely adopted to create synthetic\nelectroencephalography (EEG) data, holds promise to address these issues. In\nthis paper, we proposed a GAN-based end-to-end signal transformation network\nfor Time-window length Extension, termed as TEGAN. TEGAN transforms\nshort-length SSVEP signals into long-length artificial SSVEP signals. By\nincorporating a novel U-Net generator architecture and an auxiliary classifier\ninto the network architecture, the TEGAN could produce conditioned features in\nthe synthetic data. Additionally, we introduced a two-stage training strategy\nand the LeCam-divergence regularization term to regularize the training process\nof GAN during the network implementation. The proposed TEGAN was evaluated on\ntwo public SSVEP datasets (a 4-class dataset and a 12-class dataset). With the\nassistance of TEGAN, the performance of traditional frequency recognition\nmethods and deep learning-based methods have been significantly improved under\nlimited calibration data. And the classification performance gap of various\nfrequency recognition methods has been narrowed. This study substantiates the\nfeasibility of the proposed method to extend the data length for short-time\nSSVEP signals for developing a high-performance BCI system. 
The proposed\nGAN-based methods have the great potential of shortening the calibration time\nand cutting down the budget for various real-world BCI-based applications.\n","authors":["Yudong Pan","Ning Li","Yangsong Zhang","Peng Xu","Dezhong Yao"],"pdf_url":"https://arxiv.org/pdf/2301.05599v5.pdf","comment":"16 pages, 9 figures, 4 tables"},{"id":"http://arxiv.org/abs/2206.09380v2","updated":"2023-10-02T09:00:21Z","published":"2022-06-19T11:16:44Z","title":"Supervision Adaptation Balancing In-distribution Generalization and\n Out-of-distribution Detection","summary":" The discrepancy between in-distribution (ID) and out-of-distribution (OOD)\nsamples can lead to \\textit{distributional vulnerability} in deep neural\nnetworks, which can subsequently lead to high-confidence predictions for OOD\nsamples. This is mainly due to the absence of OOD samples during training,\nwhich fails to constrain the network properly. To tackle this issue, several\nstate-of-the-art methods include adding extra OOD samples to training and\nassign them with manually-defined labels. However, this practice can introduce\nunreliable labeling, negatively affecting ID classification. The distributional\nvulnerability presents a critical challenge for non-IID deep learning, which\naims for OOD-tolerant ID classification by balancing ID generalization and OOD\ndetection. In this paper, we introduce a novel \\textit{supervision adaptation}\napproach to generate adaptive supervision information for OOD samples, making\nthem more compatible with ID samples. Firstly, we measure the dependency\nbetween ID samples and their labels using mutual information, revealing that\nthe supervision information can be represented in terms of negative\nprobabilities across all classes. Secondly, we investigate data correlations\nbetween ID and OOD samples by solving a series of binary regression problems,\nwith the goal of refining the supervision information for more distinctly\nseparable ID classes. Our extensive experiments on four advanced network\narchitectures, two ID datasets, and eleven diversified OOD datasets demonstrate\nthe efficacy of our supervision adaptation approach in improving both ID\nclassification and OOD detection capabilities.\n","authors":["Zhilin Zhao","Longbing Cao","Kun-Yu Lin"],"pdf_url":"https://arxiv.org/pdf/2206.09380v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.01125v2","updated":"2023-10-02T08:39:18Z","published":"2022-02-02T16:34:15Z","title":"GLISp-r: A preference-based optimization algorithm with convergence\n guarantees","summary":" Preference-based optimization algorithms are iterative procedures that seek\nthe optimal calibration of a decision vector based only on comparisons between\ncouples of different tunings. At each iteration, a human decision-maker\nexpresses a preference between two calibrations (samples), highlighting which\none, if any, is better than the other. The optimization procedure must use the\nobserved preferences to find the tuning of the decision vector that is most\npreferred by the decision-maker, while also minimizing the number of\ncomparisons. In this work, we formulate the preference-based optimization\nproblem from a utility theory perspective. Then, we propose GLISp-r, an\nextension of a recent preference-based optimization procedure called GLISp. The\nlatter uses a Radial Basis Function surrogate to describe the tastes of the\ndecision-maker. 
Iteratively, GLISp proposes new samples to compare with the\nbest calibration available by trading off exploitation of the surrogate model\nand exploration of the decision space. In GLISp-r, we propose a different\ncriterion to use when looking for new candidate samples that is inspired by\nMSRS, a popular procedure in the black-box optimization framework. Compared to\nGLISp, GLISp-r is less likely to get stuck on local optima of the\npreference-based optimization problem. We motivate this claim theoretically,\nwith a proof of global convergence, and empirically, by comparing the\nperformances of GLISp and GLISp-r on several benchmark optimization problems.\n","authors":["Davide Previtali","Mirko Mazzoleni","Antonio Ferramosca","Fabio Previdi"],"pdf_url":"https://arxiv.org/pdf/2202.01125v2.pdf","comment":"Journal version available at:\n https://doi.org/10.1007/s10589-023-00491-2 28 pages, 7 figures and 1 table"},{"id":"http://arxiv.org/abs/2307.09882v3","updated":"2023-10-02T08:35:09Z","published":"2023-07-19T10:26:29Z","title":"Adversarial Likelihood Estimation With One-Way Flows","summary":" Generative Adversarial Networks (GANs) can produce high-quality samples, but\ndo not provide an estimate of the probability density around the samples.\nHowever, it has been noted that maximizing the log-likelihood within an\nenergy-based setting can lead to an adversarial framework where the\ndiscriminator provides unnormalized density (often called energy). We further\ndevelop this perspective, incorporate importance sampling, and show that 1)\nWasserstein GAN performs a biased estimate of the partition function, and we\npropose instead to use an unbiased estimator; and 2) when optimizing for\nlikelihood, one must maximize generator entropy. This is hypothesized to\nprovide a better mode coverage. Different from previous works, we explicitly\ncompute the density of the generated samples. This is the key enabler to\ndesigning an unbiased estimator of the partition function and computation of\nthe generator entropy term. The generator density is obtained via a new type of\nflow network, called one-way flow network, that is less constrained in terms of\narchitecture, as it does not require a tractable inverse function. Our\nexperimental results show that our method converges faster, produces comparable\nsample quality to GANs with similar architecture, successfully avoids\nover-fitting to commonly used datasets and produces smooth low-dimensional\nlatent representations of the training data.\n","authors":["Omri Ben-Dov","Pravir Singh Gupta","Victoria Abrevaya","Michael J. Black","Partha Ghosh"],"pdf_url":"https://arxiv.org/pdf/2307.09882v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.17097v2","updated":"2023-10-02T07:45:47Z","published":"2023-09-29T09:47:18Z","title":"Benchmarking Collaborative Learning Methods Cost-Effectiveness for\n Prostate Segmentation","summary":" Healthcare data is often split into medium/small-sized collections across\nmultiple hospitals and access to it is encumbered by privacy regulations. This\nbrings difficulties to use them for the development of machine learning and\ndeep learning models, which are known to be data-hungry. 
One way to overcome\nthis limitation is to use collaborative learning (CL) methods, which allow\nhospitals to work collaboratively to solve a task, without the need to\nexplicitly share local data.\n In this paper, we address a prostate segmentation problem from MRI in a\ncollaborative scenario by comparing two different approaches: federated\nlearning (FL) and consensus-based methods (CBM).\n To the best of our knowledge, this is the first work in which CBM, such as\nlabel fusion techniques, are used to solve a problem of collaborative learning.\nIn this setting, CBM combine predictions from locally trained models to obtain\na federated strong learner with ideally improved robustness and predictive\nvariance properties.\n Our experiments show that, in the considered practical scenario, CBMs provide\nequal or better results than FL, while being highly cost-effective. Our results\ndemonstrate that the consensus paradigm may represent a valid alternative to FL\nfor typical training tasks in medical imaging.\n","authors":["Lucia Innocenti","Michela Antonelli","Francesco Cremonesi","Kenaan Sarhan","Alejandro Granados","Vicky Goh","Sebastien Ourselin","Marco Lorenzi"],"pdf_url":"https://arxiv.org/pdf/2309.17097v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.12533v2","updated":"2023-10-02T07:38:33Z","published":"2023-02-24T09:38:41Z","title":"HUST bearing: a practical dataset for ball bearing fault diagnosis","summary":" In this work, we introduce a practical dataset named HUST bearing, that\nprovides a large set of vibration data on different ball bearings. This dataset\ncontains 90 raw vibration data of 6 types of defects (inner crack, outer crack,\nball crack, and their 2-combinations) on 5 types of bearing at 3 working\nconditions with the sample rate of 51,200 samples per second. We established\nthe envelope analysis and order tracking analysis on the introduced dataset to\nallow an initial evaluation of the data. A number of classical machine learning\nclassification methods are used to identify bearing faults of the dataset using\nfeatures in different domains. The typical advanced unsupervised transfer\nlearning algorithms also perform to observe the transferability of knowledge\namong parts of the dataset. The experimental results of examined methods on the\ndataset gain divergent accuracy up to 100% on classification task and 60-80% on\nunsupervised transfer learning task.\n","authors":["Nguyen Duc Thuan","Hoang Si Hong"],"pdf_url":"https://arxiv.org/pdf/2302.12533v2.pdf","comment":"We are considering some issues in the paper"},{"id":"http://arxiv.org/abs/2204.06350v2","updated":"2023-10-02T06:23:35Z","published":"2022-04-13T13:02:21Z","title":"LDPC codes: comparing cluster graphs to factor graphs","summary":" We present a comparison study between a cluster and factor graph\nrepresentation of LDPC codes. In probabilistic graphical models, cluster graphs\nretain useful dependence between random variables during inference, which are\nadvantageous in terms of computational cost, convergence speed, and accuracy of\nmarginal probabilities. 
This study investigates these benefits in the context\nof LDPC codes and shows that a cluster graph representation outperforms the\ntraditional factor graph representation.\n","authors":["J du Toit","J du Preez","R Wolhuter"],"pdf_url":"https://arxiv.org/pdf/2204.06350v2.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2204.07037v2","updated":"2023-10-02T06:17:53Z","published":"2022-04-13T12:55:02Z","title":"LDPC codes: tracking non-stationary channel noise using sequential\n variational Bayesian estimates","summary":" We present a sequential Bayesian learning method for tracking non-stationary\nsignal-to-noise ratios in LDPC codes using probabilistic graphical models. We\nrepresent the LDPC code as a cluster graph using a general purpose cluster\ngraph construction algorithm called the layered trees running intersection\nproperty (LTRIP) algorithm. The channel noise estimator is a global Gamma\ncluster, which we extend to allow for Bayesian tracking of non-stationary noise\nvariation. We evaluate our proposed model on real-world 5G drive test data. Our\nresults show that our model is capable of tracking non-stationary channel\nnoise, which outperforms an LDPC code with a fixed knowledge of the actual\naverage channel noise.\n","authors":["J du Toit","J du Preez","R Wolhuter"],"pdf_url":"https://arxiv.org/pdf/2204.07037v2.pdf","comment":"10 pages, 3 figures. arXiv admin note: text overlap with\n arXiv:2204.06350"},{"id":"http://arxiv.org/abs/2306.11644v2","updated":"2023-10-02T06:12:30Z","published":"2023-06-20T16:14:25Z","title":"Textbooks Are All You Need","summary":" We introduce phi-1, a new large language model for code, with significantly\nsmaller size than competing models: phi-1 is a Transformer-based model with\n1.3B parameters, trained for 4 days on 8 A100s, using a selection of ``textbook\nquality\" data from the web (6B tokens) and synthetically generated textbooks\nand exercises with GPT-3.5 (1B tokens). Despite this small scale, phi-1 attains\npass@1 accuracy 50.6% on HumanEval and 55.5% on MBPP. It also displays\nsurprising emergent properties compared to phi-1-base, our model before our\nfinetuning stage on a dataset of coding exercises, and phi-1-small, a smaller\nmodel with 350M parameters trained with the same pipeline as phi-1 that still\nachieves 45% on HumanEval.\n","authors":["Suriya Gunasekar","Yi Zhang","Jyoti Aneja","Caio César Teodoro Mendes","Allie Del Giorno","Sivakanth Gopi","Mojan Javaheripi","Piero Kauffmann","Gustavo de Rosa","Olli Saarikivi","Adil Salim","Shital Shah","Harkirat Singh Behl","Xin Wang","Sébastien Bubeck","Ronen Eldan","Adam Tauman Kalai","Yin Tat Lee","Yuanzhi Li"],"pdf_url":"https://arxiv.org/pdf/2306.11644v2.pdf","comment":"26 pages; changed color scheme of plot. fixed minor typos and added\n couple clarifications"},{"id":"http://arxiv.org/abs/2305.16610v2","updated":"2023-10-02T06:11:19Z","published":"2023-05-26T04:02:54Z","title":"Slingshot Perturbation to Learning in Monotone Games","summary":" This paper addresses the problem of learning Nash equilibria in {\\it monotone\ngames} where the gradient of the payoff functions is monotone in the strategy\nprofile space, potentially containing additive noise. The optimistic family of\nlearning algorithms, exemplified by optimistic Follow-the-Regularized-Leader\nand optimistic Mirror Descent, successfully achieves last-iterate convergence\nin scenarios devoid of noise, leading the dynamics to a Nash equilibrium. 
A\nrecent emerging trend underscores the promise of the perturbation approach,\nwhere payoff functions are perturbed based on the distance from an anchoring,\nor {\\it slingshot}, strategy. In response, we first establish a unified\nframework for learning equilibria in monotone games, accommodating both full\nand noisy feedback. Second, we construct the convergence rates toward an\napproximated equilibrium, irrespective of noise presence. Thirdly, we introduce\na twist by updating the slingshot strategy, anchoring the current strategy at\nfinite intervals. This innovation empowers us to identify the exact Nash\nequilibrium of the underlying game with guaranteed rates. The proposed\nframework is all-encompassing, integrating existing payoff-perturbed\nalgorithms. Finally, empirical demonstrations affirm that our algorithms,\ngrounded in this framework, exhibit significantly accelerated convergence.\n","authors":["Kenshi Abe","Kaito Ariu","Mitsuki Sakamoto","Atsushi Iwasaki"],"pdf_url":"https://arxiv.org/pdf/2305.16610v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.17260v2","updated":"2023-10-02T06:05:33Z","published":"2023-09-29T14:12:54Z","title":"PlaceNav: Topological Navigation through Place Recognition","summary":" Recent results suggest that splitting topological navigation into\nrobot-independent and robot-specific components improves navigation performance\nby enabling the robot-independent part to be trained with data collected by\ndifferent robot types. However, the navigation methods are still limited by the\nscarcity of suitable training data and suffer from poor computational scaling.\nIn this work, we present~\\methodname, subdividing the robot-independent part\ninto navigation-specific and generic computer vision components. We utilize\nvisual place recognition for the subgoal selection of the topological\nnavigation pipeline. This makes subgoal selection more efficient and enables\nleveraging large-scale datasets from non-robotics sources, increasing training\ndata availability. Bayes filtering, enabled by place recognition, further\nimproves navigation performance by increasing the temporal consistency of\nsubgoals. Our experimental results verify the design and the new model obtains\na 76% higher success rate in indoor and 23% higher in outdoor navigation tasks\nwith higher computational efficiency.\n","authors":["Lauri Suomela","Jussi Kalliola","Harry Edelman","Joni-Kristian Kämäräinen"],"pdf_url":"https://arxiv.org/pdf/2309.17260v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01731v2","updated":"2023-10-02T05:47:40Z","published":"2023-06-02T17:57:53Z","title":"PAGAR: Taming Reward Misalignment in Inverse Reinforcement\n Learning-Based Imitation Learning with Protagonist Antagonist Guided\n Adversarial Reward","summary":" Many imitation learning (IL) algorithms employ inverse reinforcement learning\n(IRL) to infer the underlying reward function that an expert is implicitly\noptimizing for, based on their demonstrated behaviors. However, a misalignment\nbetween the inferred reward and the true task objective can result in task\nfailures. In this paper, we introduce Protagonist Antagonist Guided Adversarial\nReward (PAGAR), a semi-supervised reward design paradigm to tackle this reward\nmisalignment problem in IRL-based IL. We identify the conditions on the\ncandidate reward functions under which PAGAR can guarantee to induce a policy\nthat succeeds in the underlying task. 
Furthermore, we present a practical\non-and-off policy approach to implement PAGAR in IRL-based IL. Experimental\nresults show that our algorithm outperforms competitive baselines on complex IL\ntasks and zero-shot IL tasks in transfer environments with limited\ndemonstrations.\n","authors":["Weichao Zhou","Wenchao Li"],"pdf_url":"https://arxiv.org/pdf/2306.01731v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02037v2","updated":"2023-10-02T05:45:19Z","published":"2023-07-05T05:42:03Z","title":"Reverse Diffusion Monte Carlo","summary":" The efficacy of modern generative models is commonly contingent upon the\nprecision of score estimation along the diffusion path, with a focus on\ndiffusion models and their ability to generate high-quality data samples. This\nstudy delves into the application of reverse diffusion to Monte Carlo sampling.\nIt is shown that score estimation can be transformed into a mean estimation\nproblem via the decomposition of the transition kernel. By estimating the mean\nof the posterior distribution, we derive a novel Monte Carlo sampling algorithm\nfrom the reverse diffusion process, which is distinct from traditional Markov\nChain Monte Carlo (MCMC) methods. We calculate the error requirements and\nsample size for the posterior distribution, and use the result to derive an\nalgorithm that can approximate the target distribution to any desired accuracy.\nAdditionally, by estimating the log-Sobolev constant of the posterior\ndistribution, we show under suitable conditions the problem of sampling from\nthe posterior can be easier than direct sampling from the target distribution\nusing traditional MCMC techniques. For Gaussian mixture models, we demonstrate\nthat the new algorithm achieves significant improvement over the traditional\nLangevin-style MCMC sampling methods both theoretically and practically. Our\nalgorithm offers a new perspective and solution beyond classical MCMC\nalgorithms for challenging complex distributions.\n","authors":["Xunpeng Huang","Hanze Dong","Yifan Hao","Yian Ma","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.02037v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.10569v2","updated":"2023-10-02T05:45:15Z","published":"2023-09-19T12:26:56Z","title":"Task Graph offloading via Deep Reinforcement Learning in Mobile Edge\n Computing","summary":" Various mobile applications that comprise dependent tasks are gaining\nwidespread popularity and are increasingly complex. These applications often\nhave low-latency requirements, resulting in a significant surge in demand for\ncomputing resources. With the emergence of mobile edge computing (MEC), it\nbecomes the most significant issue to offload the application tasks onto\nsmall-scale devices deployed at the edge of the mobile network for obtaining a\nhigh-quality user experience. However, since the environment of MEC is dynamic,\nmost existing works focusing on task graph offloading, which rely heavily on\nexpert knowledge or accurate analytical models, fail to fully adapt to such\nenvironmental changes, resulting in the reduction of user experience. This\npaper investigates the task graph offloading in MEC, considering the\ntime-varying computation capabilities of edge computing devices. To adapt to\nenvironmental changes, we model the task graph scheduling for computation\noffloading as a Markov Decision Process (MDP). 
Then, we design a deep\nreinforcement learning algorithm (SATA-DRL) to learn the task scheduling\nstrategy from the interaction with the environment, to improve user experience.\nExtensive simulations validate that SATA-DRL is superior to existing strategies\nin terms of reducing average makespan and deadline violation.\n","authors":["Jiagang Liu","Yun Mi","Xinyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.10569v2.pdf","comment":"13 figures"},{"id":"http://arxiv.org/abs/2307.05831v2","updated":"2023-10-02T03:50:18Z","published":"2023-07-11T22:53:09Z","title":"Memorization Through the Lens of Curvature of Loss Function Around\n Samples","summary":" Deep neural networks are over-parameterized and easily overfit the datasets\nthey train on. In the extreme case, it has been shown that these networks can\nmemorize a training set with fully randomized labels. We propose using the\ncurvature of loss function around each training sample, averaged over training\nepochs, as a measure of memorization of the sample. We use this metric to study\nthe generalization versus memorization properties of different samples in\npopular image datasets and show that it captures memorization statistics well,\nboth qualitatively and quantitatively. We first show that the high curvature\nsamples visually correspond to long-tailed, mislabeled, or conflicting samples,\nthose that are most likely to be memorized. This analysis helps us find, to the\nbest of our knowledge, a novel failure mode on the CIFAR100 and ImageNet\ndatasets: that of duplicated images with differing labels. Quantitatively, we\ncorroborate the validity of our scores via two methods. First, we validate our\nscores against an independent and comprehensively calculated baseline, by\nshowing high cosine similarity with the memorization scores released by Feldman\nand Zhang (2020). Second, we inject corrupted samples which are memorized by\nthe network, and show that these are learned with high curvature. To this end,\nwe synthetically mislabel a random subset of the dataset. We overfit a network\nto it and show that sorting by curvature yields high AUROC values for\nidentifying the corrupted samples. An added advantage of our method is that it\nis scalable, as it requires training only a single network as opposed to the\nthousands trained by the baseline, while capturing the aforementioned failure\nmode that the baseline fails to identify.\n","authors":["Isha Garg","Deepak Ravikumar","Kaushik Roy"],"pdf_url":"https://arxiv.org/pdf/2307.05831v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2303.01346v3","updated":"2023-10-02T03:45:15Z","published":"2023-03-02T15:24:24Z","title":"Co-learning Planning and Control Policies Constrained by Differentiable\n Logic Specifications","summary":" Synthesizing planning and control policies in robotics is a fundamental task,\nfurther complicated by factors such as complex logic specifications and\nhigh-dimensional robot dynamics. This paper presents a novel reinforcement\nlearning approach to solving high-dimensional robot navigation tasks with\ncomplex logic specifications by co-learning planning and control policies.\nNotably, this approach significantly reduces the sample complexity in training,\nallowing us to train high-quality policies with much fewer samples compared to\nexisting reinforcement learning algorithms. 
In addition, our methodology\nstreamlines complex specification extraction from map images and enables the\nefficient generation of long-horizon robot motion paths across different map\nlayouts. Moreover, our approach also demonstrates capabilities for\nhigh-dimensional control and avoiding suboptimal policies via policy alignment.\nThe efficacy of our approach is demonstrated through experiments involving\nsimulated high-dimensional quadruped robot dynamics and a real-world\ndifferential drive robot (TurtleBot3) under different types of task\nspecifications.\n","authors":["Zikang Xiong","Daniel Lawson","Joe Eappen","Ahmed H. Qureshi","Suresh Jagannathan"],"pdf_url":"https://arxiv.org/pdf/2303.01346v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.16189v4","updated":"2023-10-02T02:58:46Z","published":"2023-07-30T10:03:36Z","title":"Trustworthy Optimization: A Novel Approach to Counter Numerical\n Instability in 16-bit Neural Network Training","summary":" In this research, we address critical trustworthiness concerns related to the\nnumerical instability observed in 16-bit computations of machine learning\nmodels. Such instability, particularly when employing popular optimization\nalgorithms like RMSProp and Adam, often leads to unreliable training of deep\nneural networks. This not only disrupts the learning process but also poses\nsignificant challenges in deploying dependable models in real-world\napplications. Our investigation identifies the epsilon hyperparameter as the\nprimary source of this instability. A nuanced exploration reveals that subtle\nadjustments to epsilon within 16-bit computations can enhance the reliability\nof RMSProp and Adam, enabling more trustworthy training of 16-bit neural\nnetworks. We propose a novel, dependable approach that leverages updates from\nthe Adam optimizer to bolster the stability of the learning process. Our\ncontributions provide deeper insights into optimization challenges in\nlow-precision computations and offer solutions to ensure the trustworthiness\nand stability of deep neural network training, paving the way for their\ndependable use in various applications.\n","authors":["Juyoung Yun"],"pdf_url":"https://arxiv.org/pdf/2307.16189v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.16721v2","updated":"2023-10-02T02:30:41Z","published":"2023-03-29T14:17:21Z","title":"Performance-guaranteed regularization in maximum likelihood method:\n Gauge symmetry in Kullback -- Leibler divergence","summary":" The maximum likelihood method is the best-known method for estimating the\nprobabilities behind the data. However, the conventional method obtains the\nprobability model closest to the empirical distribution, resulting in\noverfitting. Then regularization methods prevent the model from being\nexcessively close to the wrong probability, but little is known systematically\nabout their performance. The idea of regularization is similar to\nerror-correcting codes, which obtain optimal decoding by mixing suboptimal\nsolutions with an incorrectly received code. The optimal decoding in\nerror-correcting codes is achieved based on gauge symmetry. We propose a\ntheoretically guaranteed regularization in the maximum likelihood method by\nfocusing on a gauge symmetry in Kullback -- Leibler divergence. 
In our\napproach, we obtain the optimal model without the need to search for\nhyperparameters frequently appearing in regularization.\n","authors":["Akihisa Ichiki"],"pdf_url":"https://arxiv.org/pdf/2303.16721v2.pdf","comment":"9 pages, 2 figures"},{"id":"http://arxiv.org/abs/2307.16708v2","updated":"2023-10-02T02:22:05Z","published":"2023-07-31T14:26:41Z","title":"Deep Learning Meets Adaptive Filtering: A Stein's Unbiased Risk\n Estimator Approach","summary":" This paper revisits two prominent adaptive filtering algorithms through the\nlens of algorithm unrolling, namely recursive least squares (RLS) and\nequivariant adaptive source separation (EASI), in the context of source\nestimation and separation. Building upon the unrolling methodology, we\nintroduce novel task-based deep learning frameworks, denoted as Deep RLS and\nDeep EASI. These architectures transform the iterations of the original\nalgorithms into layers of a deep neural network, thereby enabling efficient\nsource signal estimation by taking advantage of a training process. To further\nenhance performance, we propose training these deep unrolled networks utilizing\na loss function grounded on a Stein's unbiased risk estimator (SURE). Our\nempirical evaluations demonstrate the efficacy of this SURE-based approach for\nenhanced source signal estimation.\n","authors":["Zahra Esmaeilbeig","Mojtaba Soltanalian"],"pdf_url":"https://arxiv.org/pdf/2307.16708v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2011.07458"},{"id":"http://arxiv.org/abs/2304.14343v6","updated":"2023-10-02T02:19:44Z","published":"2023-04-27T17:19:26Z","title":"LibCity: A Unified Library Towards Efficient and Comprehensive Urban\n Spatial-Temporal Prediction","summary":" As deep learning technology advances and more urban spatial-temporal data\naccumulates, an increasing number of deep learning models are being proposed to\nsolve urban spatial-temporal prediction problems. However, there are\nlimitations in the existing field, including open-source data being in various\nformats and difficult to use, few papers making their code and data openly\navailable, and open-source models often using different frameworks and\nplatforms, making comparisons challenging. A standardized framework is urgently\nneeded to implement and evaluate these methods. To address these issues, we\npropose LibCity, an open-source library that offers researchers a credible\nexperimental tool and a convenient development framework. In this library, we\nhave reproduced 65 spatial-temporal prediction models and collected 55\nspatial-temporal datasets, allowing researchers to conduct comprehensive\nexperiments conveniently. By enabling fair model comparisons, designing a\nunified data storage format, and simplifying the process of developing new\nmodels, LibCity is poised to make significant contributions to the\nspatial-temporal prediction field.\n","authors":["Jiawei Jiang","Chengkai Han","Wenjun Jiang","Wayne Xin Zhao","Jingyuan Wang"],"pdf_url":"https://arxiv.org/pdf/2304.14343v6.pdf","comment":"Extended version of https://dl.acm.org/doi/10.1145/3474717.3483923"},{"id":"http://arxiv.org/abs/2303.08040v3","updated":"2023-10-02T02:06:48Z","published":"2023-03-14T16:19:44Z","title":"Beyond Demographic Parity: Redefining Equal Treatment","summary":" Liberalism-oriented political philosophy reasons that all individuals should\nbe treated equally independently of their protected characteristics. 
Related\nwork in machine learning has translated the concept of \\emph{equal treatment}\ninto terms of \\emph{equal outcome} and measured it as \\emph{demographic parity}\n(also called \\emph{statistical parity}). Our analysis reveals that the two\nconcepts of equal outcome and equal treatment diverge; therefore, demographic\nparity does not faithfully represent the notion of \\emph{equal treatment}. We\npropose a new formalization for equal treatment by (i) considering the\ninfluence of feature values on predictions, such as computed by Shapley values\ndecomposing predictions across its features, (ii) defining distributions of\nexplanations, and (iii) comparing explanation distributions between populations\nwith different protected characteristics. We show the theoretical properties of\nour notion of equal treatment and devise a classifier two-sample test based on\nthe AUC of an equal treatment inspector. We study our formalization of equal\ntreatment on synthetic and natural data. We release \\texttt{explanationspace},\nan open-source Python package with methods and tutorials.\n","authors":["Carlos Mougan","Laura State","Antonio Ferrara","Salvatore Ruggieri","Steffen Staab"],"pdf_url":"https://arxiv.org/pdf/2303.08040v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2109.12701v3","updated":"2023-10-02T01:38:45Z","published":"2021-09-26T20:49:16Z","title":"Sparse Plus Low Rank Matrix Decomposition: A Discrete Optimization\n Approach","summary":" We study the Sparse Plus Low-Rank decomposition problem (SLR), which is the\nproblem of decomposing a corrupted data matrix into a sparse matrix of\nperturbations plus a low-rank matrix containing the ground truth. SLR is a\nfundamental problem in Operations Research and Machine Learning which arises in\nvarious applications, including data compression, latent semantic indexing,\ncollaborative filtering, and medical imaging. We introduce a novel formulation\nfor SLR that directly models its underlying discreteness. For this formulation,\nwe develop an alternating minimization heuristic that computes high-quality\nsolutions and a novel semidefinite relaxation that provides meaningful bounds\nfor the solutions returned by our heuristic. We also develop a custom\nbranch-and-bound algorithm that leverages our heuristic and convex relaxations\nto solve small instances of SLR to certifiable (near) optimality. Given an\ninput $n$-by-$n$ matrix, our heuristic scales to solve instances where\n$n=10000$ in minutes, our relaxation scales to instances where $n=200$ in\nhours, and our branch-and-bound algorithm scales to instances where $n=25$ in\nminutes. Our numerical results demonstrate that our approach outperforms\nexisting state-of-the-art approaches in terms of rank, sparsity, and\nmean-square error while maintaining a comparable runtime.\n","authors":["Dimitris Bertsimas","Ryan Cory-Wright","Nicholas A. G. Johnson"],"pdf_url":"https://arxiv.org/pdf/2109.12701v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.03601v3","updated":"2023-10-02T01:33:04Z","published":"2022-06-07T21:58:29Z","title":"Decoupled Self-supervised Learning for Non-Homophilous Graphs","summary":" This paper studies the problem of conducting self-supervised learning for\nnode representation learning on graphs. Most existing self-supervised learning\nmethods assume the graph is homophilous, where linked nodes often belong to the\nsame class or have similar features. However, such assumptions of homophily do\nnot always hold in real-world graphs. 
We address this problem by developing a\ndecoupled self-supervised learning (DSSL) framework for graph neural networks.\nDSSL imitates a generative process of nodes and links from latent variable\nmodeling of the semantic structure, which decouples different underlying\nsemantics between different neighborhoods into the self-supervised learning\nprocess. Our DSSL framework is agnostic to the encoders and does not need\nprefabricated augmentations, thus is flexible to different graphs. To\neffectively optimize the framework, we derive the evidence lower bound of the\nself-supervised objective and develop a scalable training algorithm with\nvariational inference. We provide a theoretical analysis to justify that DSSL\nenjoys the better downstream performance. Extensive experiments on various\ntypes of graph benchmarks demonstrate that our proposed framework can achieve\nbetter performance compared with competitive baselines.\n","authors":["Teng Xiao","Zhengyu Chen","Zhimeng Guo","Zeyang Zhuang","Suhang Wang"],"pdf_url":"https://arxiv.org/pdf/2206.03601v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.08940v2","updated":"2023-10-02T00:55:29Z","published":"2023-01-21T11:30:13Z","title":"Quasi-optimal Reinforcement Learning with Continuous Actions","summary":" Many real-world applications of reinforcement learning (RL) require making\ndecisions in continuous action environments. In particular, determining the\noptimal dose level plays a vital role in developing medical treatment regimes.\nOne challenge in adapting existing RL algorithms to medical applications,\nhowever, is that the popular infinite support stochastic policies, e.g.,\nGaussian policy, may assign riskily high dosages and harm patients seriously.\nHence, it is important to induce a policy class whose support only contains\nnear-optimal actions, and shrink the action-searching area for effectiveness\nand reliability. To achieve this, we develop a novel \\emph{quasi-optimal\nlearning algorithm}, which can be easily optimized in off-policy settings with\nguaranteed convergence under general function approximations. Theoretically, we\nanalyze the consistency, sample complexity, adaptability, and convergence of\nthe proposed algorithm. We evaluate our algorithm with comprehensive simulated\nexperiments and a dose suggestion real application to Ohio Type 1 diabetes\ndataset.\n","authors":["Yuhan Li","Wenzhuo Zhou","Ruoqing Zhu"],"pdf_url":"https://arxiv.org/pdf/2301.08940v2.pdf","comment":"The first two authors contributed equally to this work"},{"id":"http://arxiv.org/abs/2309.13278v2","updated":"2023-10-02T00:41:01Z","published":"2023-09-23T06:35:44Z","title":"Distributional Shift-Aware Off-Policy Interval Estimation: A Unified\n Error Quantification Framework","summary":" We study high-confidence off-policy evaluation in the context of\ninfinite-horizon Markov decision processes, where the objective is to establish\na confidence interval (CI) for the target policy value using only offline data\npre-collected from unknown behavior policies. This task faces two primary\nchallenges: providing a comprehensive and rigorous error quantification in CI\nestimation, and addressing the distributional shift that results from\ndiscrepancies between the distribution induced by the target policy and the\noffline data-generating process. 
Motivated by an innovative unified error\nanalysis, we jointly quantify the two sources of estimation errors: the\nmisspecification error on modeling marginalized importance weights and the\nstatistical uncertainty due to sampling, within a single interval. This unified\nframework reveals a previously hidden tradeoff between the errors, which\nundermines the tightness of the CI. Relying on a carefully designed\ndiscriminator function, the proposed estimator achieves a dual purpose:\nbreaking the curse of the tradeoff to attain the tightest possible CI, and\nadapting the CI to ensure robustness against distributional shifts. Our method\nis applicable to time-dependent data without assuming any weak dependence\nconditions via leveraging a local supermartingale/martingale structure.\nTheoretically, we show that our algorithm is sample-efficient, error-robust,\nand provably convergent even in non-linear function approximation settings. The\nnumerical performance of the proposed method is examined in synthetic datasets\nand an OhioT1DM mobile health study.\n","authors":["Wenzhuo Zhou","Yuhan Li","Ruoqing Zhu","Annie Qu"],"pdf_url":"https://arxiv.org/pdf/2309.13278v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.13459v2","updated":"2023-10-02T00:38:01Z","published":"2023-09-23T19:07:03Z","title":"A Model-Agnostic Graph Neural Network for Integrating Local and Global\n Information","summary":" Graph Neural Networks (GNNs) have achieved promising performance in a variety\nof graph-focused tasks. Despite their success, existing GNNs suffer from two\nsignificant limitations: a lack of interpretability in results due to their\nblack-box nature, and an inability to learn representations of varying orders.\nTo tackle these issues, we propose a novel Model-agnostic Graph Neural Network\n(MaGNet) framework, which is able to sequentially integrate information of\nvarious orders, extract knowledge from high-order neighbors, and provide\nmeaningful and interpretable results by identifying influential compact graph\nstructures. In particular, MaGNet consists of two components: an estimation\nmodel for the latent representation of complex relationships under graph\ntopology, and an interpretation model that identifies influential nodes, edges,\nand important node features. Theoretically, we establish the generalization\nerror bound for MaGNet via empirical Rademacher complexity, and showcase its\npower to represent layer-wise neighborhood mixing. We conduct comprehensive\nnumerical studies using simulated data to demonstrate the superior performance\nof MaGNet in comparison to several state-of-the-art alternatives. Furthermore,\nwe apply MaGNet to a real-world case study aimed at extracting task-critical\ninformation from brain activity data, thereby highlighting its effectiveness in\nadvancing scientific research.\n","authors":["Wenzhuo Zhou","Annie Qu","Keiland W. Cooper","Norbert Fortin","Babak Shahbaba"],"pdf_url":"https://arxiv.org/pdf/2309.13459v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16188v2","updated":"2023-10-02T00:29:01Z","published":"2023-09-28T06:18:34Z","title":"Stackelberg Batch Policy Learning","summary":" Batch reinforcement learning (RL) defines the task of learning from a fixed\nbatch of data lacking exhaustive exploration. Worst-case optimality algorithms,\nwhich calibrate a value-function model class from logged experience and perform\nsome type of pessimistic evaluation under the learned model, have emerged as a\npromising paradigm for batch RL. 
However, contemporary works on this stream\nhave commonly overlooked the hierarchical decision-making structure hidden in\nthe optimization landscape. In this paper, we adopt a game-theoretical\nviewpoint and model the policy learning diagram as a two-player general-sum\ngame with a leader-follower structure. We propose a novel stochastic\ngradient-based learning algorithm: StackelbergLearner, in which the leader\nplayer updates according to the total derivative of its objective instead of\nthe usual individual gradient, and the follower player makes individual updates\nand ensures transition-consistent pessimistic reasoning. The derived learning\ndynamic naturally lends StackelbergLearner to a game-theoretic interpretation\nand provides a convergence guarantee to differentiable Stackelberg equilibria.\nFrom a theoretical standpoint, we provide instance-dependent regret bounds with\ngeneral function approximation, which shows that our algorithm can learn a\nbest-effort policy that is able to compete against any comparator policy that\nis covered by batch data. Notably, our theoretical regret guarantees only\nrequire realizability without any data coverage and strong function\napproximation conditions, e.g., Bellman closedness, which is in contrast to\nprior works lacking such guarantees. Through comprehensive experiments, we find\nthat our algorithm consistently performs as well or better as compared to\nstate-of-the-art methods in batch RL benchmark and real-world datasets.\n","authors":["Wenzhuo Zhou","Annie Qu"],"pdf_url":"https://arxiv.org/pdf/2309.16188v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06479v2","updated":"2023-10-02T00:15:31Z","published":"2023-06-10T16:36:22Z","title":"Learning a Neuron by a Shallow ReLU Network: Dynamics and Implicit Bias\n for Correlated Inputs","summary":" We prove that, for the fundamental regression task of learning a single\nneuron, training a one-hidden layer ReLU network of any width by gradient flow\nfrom a small initialisation converges to zero loss and is implicitly biased to\nminimise the rank of network parameters. By assuming that the training points\nare correlated with the teacher neuron, we complement previous work that\nconsidered orthogonal datasets. Our results are based on a detailed\nnon-asymptotic analysis of the dynamics of each hidden neuron throughout the\ntraining. We also show and characterise a surprising distinction in this\nsetting between interpolator networks of minimal rank and those of minimal\nEuclidean norm. Finally we perform a range of numerical experiments, which\ncorroborate our theoretical findings.\n","authors":["Dmitry Chistikov","Matthias Englert","Ranko Lazic"],"pdf_url":"https://arxiv.org/pdf/2306.06479v2.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2310.01596v1","updated":"2023-10-02T19:41:42Z","published":"2023-10-02T19:41:42Z","title":"ImagenHub: Standardizing the evaluation of conditional image generation\n models","summary":" Recently, a myriad of conditional image generation and editing models have\nbeen developed to serve different downstream tasks, including text-to-image\ngeneration, text-guided image editing, subject-driven image generation,\ncontrol-guided image generation, etc. However, we observe huge inconsistencies\nin experimental conditions: datasets, inference, and evaluation metrics -\nrender fair comparisons difficult. 
This paper proposes ImagenHub, which is a\none-stop library to standardize the inference and evaluation of all the\nconditional image generation models. Firstly, we define seven prominent tasks\nand curate high-quality evaluation datasets for them. Secondly, we built a\nunified inference pipeline to ensure fair comparison. Thirdly, we design two\nhuman evaluation scores, i.e. Semantic Consistency and Perceptual Quality,\nalong with comprehensive guidelines to evaluate generated images. We train\nexpert raters to evaluate the model outputs based on the proposed metrics. Our\nhuman evaluation achieves a high inter-worker agreement of Krippendorff's alpha\non 76% models with a value higher than 0.4. We comprehensively evaluated a\ntotal of around 30 models and observed three key takeaways: (1) the existing\nmodels' performance is generally unsatisfying except for Text-guided Image\nGeneration and Subject-driven Image Generation, with 74% models achieving an\noverall score lower than 0.5. (2) we examined the claims from published papers\nand found 83% of them hold with a few exceptions. (3) None of the existing\nautomatic metrics has a Spearman's correlation higher than 0.2 except\nsubject-driven image generation. Moving forward, we will continue our efforts\nto evaluate newly published models and update our leaderboard to keep track of\nthe progress in conditional image generation.\n","authors":["Max Ku","Tianle Li","Kai Zhang","Yujie Lu","Xingyu Fu","Wenwen Zhuang","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2310.01596v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04603v2","updated":"2023-10-02T17:43:23Z","published":"2023-08-08T22:06:14Z","title":"A Brief Yet In-Depth Survey of Deep Learning-Based Image Watermarking","summary":" This paper presents a comprehensive survey on deep learning-based image\nwatermarking, a technique that entails the invisible embedding and extraction\nof watermarks within a cover image, aiming to offer a seamless blend of\nrobustness and adaptability. We navigate the complex landscape of this\ninterdisciplinary domain, linking historical foundations, current innovations,\nand prospective developments. Unlike existing literature, our study\nconcentrates exclusively on image watermarking with deep learning, delivering\nan in-depth, yet brief analysis enriched by three fundamental contributions.\nFirst, we introduce a refined categorization, segmenting the field into\nEmbedder-Extractor, Deep Networks as a Feature Transformation, and Hybrid\nMethods. This taxonomy, inspired by the varied roles of deep learning across\nstudies, is designed to infuse clarity, offering readers technical insights and\ndirectional guidance. Second, our exploration dives into representative\nmethodologies, encapsulating the diverse research directions and inherent\nchallenges within each category to provide a consolidated perspective. 
Lastly,\nwe venture beyond established boundaries to outline emerging frontiers,\noffering a detailed insight into prospective research avenues.\n","authors":["Xin Zhong","Arjon Das","Fahad Alrasheedi","Abdullah Tanvir"],"pdf_url":"https://arxiv.org/pdf/2308.04603v2.pdf","comment":null}]},"2023-10-01T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2308.01544v2","updated":"2023-10-01T23:24:13Z","published":"2023-08-03T05:27:12Z","title":"Multimodal Neurons in Pretrained Text-Only Transformers","summary":" Language models demonstrate remarkable capacity to generalize representations\nlearned in one modality to downstream tasks in other modalities. Can we trace\nthis ability to individual neurons? We study the case where a frozen text\ntransformer is augmented with vision using a self-supervised visual encoder and\na single linear projection learned on an image-to-text task. Outputs of the\nprojection layer are not immediately decodable into language describing image\ncontent; instead, we find that translation between modalities occurs deeper\nwithin the transformer. We introduce a procedure for identifying \"multimodal\nneurons\" that convert visual representations into corresponding text, and\ndecoding the concepts they inject into the model's residual stream. In a series\nof experiments, we show that multimodal neurons operate on specific visual\nconcepts across inputs, and have a systematic causal effect on image\ncaptioning.\n","authors":["Sarah Schwettmann","Neil Chowdhury","Samuel Klein","David Bau","Antonio Torralba"],"pdf_url":"https://arxiv.org/pdf/2308.01544v2.pdf","comment":"Oral presentation at ICCV CLVL 2023"},{"id":"http://arxiv.org/abs/2211.09110v2","updated":"2023-10-01T21:44:23Z","published":"2022-11-16T18:51:34Z","title":"Holistic Evaluation of Language Models","summary":" Language models (LMs) are becoming the foundation for almost all major\nlanguage technologies, but their capabilities, limitations, and risks are not\nwell understood. We present Holistic Evaluation of Language Models (HELM) to\nimprove the transparency of language models. First, we taxonomize the vast\nspace of potential scenarios (i.e. use cases) and metrics (i.e. desiderata)\nthat are of interest for LMs. Then we select a broad subset based on coverage\nand feasibility, noting what's missing or underrepresented (e.g. question\nanswering for neglected English dialects, metrics for trustworthiness). Second,\nwe adopt a multi-metric approach: We measure 7 metrics (accuracy, calibration,\nrobustness, fairness, bias, toxicity, and efficiency) for each of 16 core\nscenarios when possible (87.5% of the time). This ensures metrics beyond\naccuracy don't fall to the wayside, and that trade-offs are clearly exposed. We\nalso perform 7 targeted evaluations, based on 26 targeted scenarios, to analyze\nspecific aspects (e.g. reasoning, disinformation). Third, we conduct a\nlarge-scale evaluation of 30 prominent language models (spanning open,\nlimited-access, and closed models) on all 42 scenarios, 21 of which were not\npreviously used in mainstream LM evaluation. Prior to HELM, models on average\nwere evaluated on just 17.9% of the core HELM scenarios, with some prominent\nmodels not sharing a single scenario in common. We improve this to 96.0%: now\nall 30 models have been densely benchmarked on the same core scenarios and\nmetrics under standardized conditions. Our evaluation surfaces 25 top-level\nfindings. 
For full transparency, we release all raw model prompts and\ncompletions publicly for further analysis, as well as a general modular\ntoolkit. We intend for HELM to be a living benchmark for the community,\ncontinuously updated with new scenarios, metrics, and models.\n","authors":["Percy Liang","Rishi Bommasani","Tony Lee","Dimitris Tsipras","Dilara Soylu","Michihiro Yasunaga","Yian Zhang","Deepak Narayanan","Yuhuai Wu","Ananya Kumar","Benjamin Newman","Binhang Yuan","Bobby Yan","Ce Zhang","Christian Cosgrove","Christopher D. Manning","Christopher Ré","Diana Acosta-Navas","Drew A. Hudson","Eric Zelikman","Esin Durmus","Faisal Ladhak","Frieda Rong","Hongyu Ren","Huaxiu Yao","Jue Wang","Keshav Santhanam","Laurel Orr","Lucia Zheng","Mert Yuksekgonul","Mirac Suzgun","Nathan Kim","Neel Guha","Niladri Chatterji","Omar Khattab","Peter Henderson","Qian Huang","Ryan Chi","Sang Michael Xie","Shibani Santurkar","Surya Ganguli","Tatsunori Hashimoto","Thomas Icard","Tianyi Zhang","Vishrav Chaudhary","William Wang","Xuechen Li","Yifan Mai","Yuhui Zhang","Yuta Koreeda"],"pdf_url":"https://arxiv.org/pdf/2211.09110v2.pdf","comment":"Authored by the Center for Research on Foundation Models (CRFM) at\n the Stanford Institute for Human-Centered Artificial Intelligence (HAI).\n Project page: https://crfm.stanford.edu/helm/v1.0"},{"id":"http://arxiv.org/abs/2305.09770v5","updated":"2023-10-01T20:00:06Z","published":"2023-05-16T19:48:49Z","title":"ConvXAI: Delivering Heterogeneous AI Explanations via Conversations to\n Support Human-AI Scientific Writing","summary":" Despite a surge collection of XAI methods, users still struggle to obtain\nrequired AI explanations. Previous research suggests chatbots as dynamic\nsolutions, but the effective design of conversational XAI agents for practical\nhuman needs remains under-explored. This paper focuses on Conversational XAI\nfor AI-assisted scientific writing tasks. Drawing from human linguistic\ntheories and formative studies, we identify four design rationales:\n\"multifaceted\", \"controllability\", \"mix-initiative\", \"context-aware\ndrill-down\". We incorporate them into an interactive prototype, ConvXAI, which\nfacilitates heterogeneous AI explanations for scientific writing through\ndialogue. In two studies with 21 users, ConvXAI outperforms a GUI-based\nbaseline on improving human-perceived understanding and writing improvement.\nThe paper further discusses the practical human usage patterns in interacting\nwith ConvXAI for scientific co-writing.\n","authors":["Hua Shen","Chieh-Yang Huang","Tongshuang Wu","Ting-Hao 'Kenneth' Huang"],"pdf_url":"https://arxiv.org/pdf/2305.09770v5.pdf","comment":"To appear in CSCW 2023 Demo. ConvXAI system code:\n https://github.com/huashen218/convxai.git"},{"id":"http://arxiv.org/abs/2004.08249v3","updated":"2023-10-01T18:34:20Z","published":"2020-04-17T13:59:07Z","title":"Understanding the Difficulty of Training Transformers","summary":" Transformers have proved effective in many NLP tasks. However, their training\nrequires non-trivial efforts regarding designing cutting-edge optimizers and\nlearning rate schedulers carefully (e.g., conventional SGD fails to train\nTransformers effectively). Our objective here is to understand $\\textit{what\ncomplicates Transformer training}$ from both empirical and theoretical\nperspectives. Our analysis reveals that unbalanced gradients are not the root\ncause of the instability of training. 
Instead, we identify an amplification\neffect that influences training substantially -- for each layer in a\nmulti-layer Transformer model, heavy dependency on its residual branch makes\ntraining unstable, since it amplifies small parameter perturbations (e.g.,\nparameter updates) and results in significant disturbances in the model output.\nYet we observe that a light dependency limits the model potential and leads to\ninferior trained models. Inspired by our analysis, we propose Admin\n($\\textbf{Ad}$aptive $\\textbf{m}$odel $\\textbf{in}$itialization) to stabilize\nthe early stage's training and unleash its full potential in the late\nstage. Extensive experiments show that Admin is more stable, converges faster,\nand leads to better performance. Implementations are released at:\nhttps://github.com/LiyuanLucasLiu/Transforemr-Clinic.\n","authors":["Liyuan Liu","Xiaodong Liu","Jianfeng Gao","Weizhu Chen","Jiawei Han"],"pdf_url":"https://arxiv.org/pdf/2004.08249v3.pdf","comment":"EMNLP 2020"},{"id":"http://arxiv.org/abs/2309.10706v2","updated":"2023-10-01T18:21:58Z","published":"2023-09-19T15:46:40Z","title":"OpenBA: An Open-sourced 15B Bilingual Asymmetric seq2seq Model\n Pre-trained from Scratch","summary":" Large language models (LLMs) with billions of parameters have demonstrated\noutstanding performance on various natural language processing tasks. This\nreport presents OpenBA, an open-sourced 15B bilingual asymmetric seq2seq model,\nto contribute an LLM variant to the Chinese-oriented open-source model\ncommunity. We enhance OpenBA with effective and efficient techniques as well as\nadopt a three-stage training strategy to train the model from scratch. Our\nsolution can also achieve very competitive performance with only 380B tokens,\nwhich is better than LLaMA-70B on the BELEBELE benchmark, BLOOM-176B on the\nMMLU benchmark, GLM-130B on the C-Eval (hard) benchmark. This report provides\nthe main details to pre-train an analogous model, including pre-training data\nprocessing, Bilingual Flan data collection, the empirical observations that\ninspire our model architecture design, training objectives of different stages,\nand other enhancement techniques. Additionally, we also provide the fine-tuning\ndetails of OpenBA on four downstream tasks. We have refactored our code to\nfollow the design principles of the Huggingface Transformers Library, making it\nmore convenient for developers to use, and released checkpoints of different\ntraining stages at https://huggingface.co/openBA. More details of our project\nare available at https://github.com/OpenNLG/openBA.git.\n","authors":["Juntao Li","Zecheng Tang","Yuyang Ding","Pinzheng Wang","Pei Guo","Wangjie You","Dan Qiao","Wenliang Chen","Guohong Fu","Qiaoming Zhu","Guodong Zhou","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.10706v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05653v2","updated":"2023-10-01T15:25:41Z","published":"2023-09-11T17:47:22Z","title":"MAmmoTH: Building Math Generalist Models through Hybrid Instruction\n Tuning","summary":" We introduce MAmmoTH, a series of open-source large language models (LLMs)\nspecifically tailored for general math problem-solving. The MAmmoTH models are\ntrained on MathInstruct, our meticulously curated instruction tuning dataset.\nMathInstruct is compiled from 13 math datasets with intermediate rationales,\nsix of which have rationales newly curated by us. 
It presents a unique hybrid\nof chain-of-thought (CoT) and program-of-thought (PoT) rationales, and also\nensures extensive coverage of diverse fields in math. The hybrid of CoT and PoT\nnot only unleashes the potential of tool use but also allows different thought\nprocesses for different math problems. As a result, the MAmmoTH series\nsubstantially outperform existing open-source models on nine mathematical\nreasoning datasets across all scales with an average accuracy gain between 16%\nand 32%. Remarkably, our MAmmoTH-7B model reaches 33% on MATH (a\ncompetition-level dataset), which exceeds the best open-source 7B model\n(WizardMath) by 23%, and the MAmmoTH-34B model achieves 44% accuracy on MATH,\neven surpassing GPT-4's CoT result. Our work underscores the importance of\ndiverse problem coverage and the use of hybrid rationales in developing\nsuperior math generalist models.\n","authors":["Xiang Yue","Xingwei Qu","Ge Zhang","Yao Fu","Wenhao Huang","Huan Sun","Yu Su","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2309.05653v2.pdf","comment":"Work in progress; Xiang Yue and Wenhu Chen contributed equally to\n this paper"},{"id":"http://arxiv.org/abs/2211.06774v3","updated":"2023-10-01T13:59:25Z","published":"2022-11-13T00:09:36Z","title":"Large-Scale Bidirectional Training for Zero-Shot Image Captioning","summary":" When trained on large-scale datasets, image captioning models can understand\nthe content of images from a general domain but often fail to generate\naccurate, detailed captions. To improve performance, pretraining-and-finetuning\nhas been a key strategy for image captioning. However, we find that large-scale\nbidirectional training between image and text enables zero-shot image\ncaptioning. In this paper, we introduce Bidirectional Image Text Training in\nlargER Scale, BITTERS, an efficient training and inference framework for\nzero-shot image captioning. We also propose a new evaluation benchmark which\ncomprises of high quality datasets and an extensive set of metrics to properly\nevaluate zero-shot captioning accuracy and societal bias. We additionally\nprovide an efficient finetuning approach for keyword extraction. We show that\ncareful selection of large-scale training set and model architecture is the key\nto achieving zero-shot image captioning.\n","authors":["Taehoon Kim","Mark Marsden","Pyunghwan Ahn","Sangyun Kim","Sihaeng Lee","Alessandra Sala","Seung Hwan Kim"],"pdf_url":"https://arxiv.org/pdf/2211.06774v3.pdf","comment":"Arxiv Preprint. Work in progress"},{"id":"http://arxiv.org/abs/2306.05079v2","updated":"2023-10-01T13:01:07Z","published":"2023-06-08T10:02:04Z","title":"Enhancing Robustness of AI Offensive Code Generators via Data\n Augmentation","summary":" In this work, we present a method to add perturbations to the code\ndescriptions to create new inputs in natural language (NL) from\nwell-intentioned developers that diverge from the original ones due to the use\nof new words or because they miss part of them. The goal is to analyze how and\nto what extent perturbations affect the performance of AI code generators in\nthe context of security-oriented code. First, we show that perturbed\ndescriptions preserve the semantics of the original, non-perturbed ones. Then,\nwe use the method to assess the robustness of three state-of-the-art code\ngenerators against the newly perturbed inputs, showing that the performance of\nthese AI-based solutions is highly affected by perturbations in the NL\ndescriptions. 
To enhance their robustness, we use the method to perform data\naugmentation, i.e., to increase the variability and diversity of the NL\ndescriptions in the training data, proving its effectiveness against both\nperturbed and non-perturbed code descriptions.\n","authors":["Cristina Improta","Pietro Liguori","Roberto Natella","Bojan Cukic","Domenico Cotroneo"],"pdf_url":"https://arxiv.org/pdf/2306.05079v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.07778v2","updated":"2023-10-01T10:38:39Z","published":"2023-02-15T16:55:15Z","title":"Measuring the Instability of Fine-Tuning","summary":" Fine-tuning pre-trained language models on downstream tasks with varying\nrandom seeds has been shown to be unstable, especially on small datasets. Many\nprevious studies have investigated this instability and proposed methods to\nmitigate it. However, most studies only used the standard deviation of\nperformance scores (SD) as their measure, which is a narrow characterization of\ninstability. In this paper, we analyze SD and six other measures quantifying\ninstability at different levels of granularity. Moreover, we propose a\nsystematic framework to evaluate the validity of these measures. Finally, we\nanalyze the consistency and difference between different measures by\nreassessing existing instability mitigation methods. We hope our results will\ninform the development of better measurements of fine-tuning instability.\n","authors":["Yupei Du","Dong Nguyen"],"pdf_url":"https://arxiv.org/pdf/2302.07778v2.pdf","comment":"20 pages, 26 Figures, accepted to ACL 2023 main conference"},{"id":"http://arxiv.org/abs/2307.12856v2","updated":"2023-10-01T10:30:27Z","published":"2023-07-24T14:56:30Z","title":"A Real-World WebAgent with Planning, Long Context Understanding, and\n Program Synthesis","summary":" Pre-trained large language models (LLMs) have recently achieved better\ngeneralization and sample efficiency in autonomous web automation. However, the\nperformance on real-world websites has still suffered from (1) open domainness,\n(2) limited context length, and (3) lack of inductive bias on HTML. We\nintroduce WebAgent, an LLM-driven agent that learns from self-experience to\ncomplete tasks on real websites following natural language instructions.\nWebAgent plans ahead by decomposing instructions into canonical\nsub-instructions, summarizes long HTML documents into task-relevant snippets,\nand acts on websites via Python programs generated from those. We design\nWebAgent with Flan-U-PaLM, for grounded code generation, and HTML-T5, new\npre-trained LLMs for long HTML documents using local and global attention\nmechanisms and a mixture of long-span denoising objectives, for planning and\nsummarization. 
We empirically demonstrate that our modular recipe improves the\nsuccess on real websites by over 50%, and that HTML-T5 is the best model to\nsolve various HTML understanding tasks; achieving 18.7% higher success rate\nthan the prior method on MiniWoB web automation benchmark, and SoTA performance\non Mind2Web, an offline task planning evaluation.\n","authors":["Izzeddin Gur","Hiroki Furuta","Austin Huang","Mustafa Safdari","Yutaka Matsuo","Douglas Eck","Aleksandra Faust"],"pdf_url":"https://arxiv.org/pdf/2307.12856v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.00456v2","updated":"2023-10-01T10:27:24Z","published":"2023-02-01T13:59:47Z","title":"Analyzing Feed-Forward Blocks in Transformers through the Lens of\n Attention Map","summary":" Given that Transformers are ubiquitous in wide tasks, interpreting their\ninternals is a pivotal issue. Still, their particular components, feed-forward\n(FF) blocks, have typically been less analyzed despite their substantial\nparameter amounts. We analyze the input contextualization effects of FF blocks\nby rendering them in the attention maps as a human-friendly visualization\nscheme. Our experiments with both masked- and causal-language models reveal\nthat FF networks modify the input contextualization to emphasize specific types\nof linguistic compositions. In addition, FF and its surrounding components tend\nto cancel out each other's effects, suggesting potential redundancy in the\nprocessing of the Transformer layer.\n","authors":["Goro Kobayashi","Tatsuki Kuribayashi","Sho Yokoi","Kentaro Inui"],"pdf_url":"https://arxiv.org/pdf/2302.00456v2.pdf","comment":"29 pages, 25 figures, 3 tables"},{"id":"http://arxiv.org/abs/2308.08493v2","updated":"2023-10-01T09:11:03Z","published":"2023-08-16T16:48:57Z","title":"Time Travel in LLMs: Tracing Data Contamination in Large Language Models","summary":" Data contamination, i.e., the presence of test data from downstream tasks in\nthe training data of large language models (LLMs), is a potential major issue\nin measuring LLMs' real effectiveness on other tasks. We propose a\nstraightforward yet effective method for identifying data contamination within\nLLMs. At its core, our approach starts by identifying potential contamination\nat the instance level; using this information, our approach then assesses wider\ncontamination at the partition level. To estimate contamination of individual\ninstances, we employ \"guided instruction:\" a prompt consisting of the dataset\nname, partition type, and the random-length initial segment of a reference\ninstance, asking the LLM to complete it. An instance is flagged as contaminated\nif the LLM's output either exactly or nearly matches the latter segment of the\nreference. To understand if an entire partition is contaminated, we propose two\nideas. The first idea marks a dataset partition as contaminated if the average\noverlap score with the reference instances (as measured by ROUGE-L or BLEURT)\nis statistically significantly better with the completions from guided\ninstruction compared to a \"general instruction\" that does not include the\ndataset and partition name. The second idea marks a dataset partition as\ncontaminated if a classifier based on GPT-4 with few-shot in-context learning\nprompt marks multiple generated completions as exact/near-exact matches of the\ncorresponding reference instances. 
Our best method achieves an accuracy between\n92% and 100% in detecting if an LLM is contaminated with seven datasets,\ncontaining train and test/validation partitions, when contrasted with manual\nevaluation by human experts. Further, our findings indicate that GPT-4 is\ncontaminated with AG News, WNLI, and XSum datasets.\n","authors":["Shahriar Golchin","Mihai Surdeanu"],"pdf_url":"https://arxiv.org/pdf/2308.08493v2.pdf","comment":"v2 preprint"},{"id":"http://arxiv.org/abs/2305.15852v2","updated":"2023-10-01T07:22:39Z","published":"2023-05-25T08:43:46Z","title":"Self-contradictory Hallucinations of Large Language Models: Evaluation,\n Detection and Mitigation","summary":" Large language models (large LMs) are susceptible to producing text that\ncontains hallucinated content. An important instance of this problem is\nself-contradiction, where the LM generates two contradictory sentences within\nthe same context. In this work, we present a comprehensive investigation into\nself-contradiction for various instruction-tuned LMs, covering evaluation,\ndetection, and mitigation. Our analysis reveals the prevalence of\nself-contradictions when LMs generate text for open-domain topics, e.g., in\n17.7% of all sentences produced by ChatGPT. Self-contradiction also complements\nretrieval-based methods, as a large portion of them (e.g., 35.8% for ChatGPT)\ncannot be verified using Wikipedia. We then propose a novel prompting-based\nframework designed to effectively detect and mitigate self-contradictions. Our\ndetector achieves high accuracy, e.g., around 80% F1 score when prompting\nChatGPT. The mitigation algorithm iteratively refines the generated text to\nremove contradictory information while preserving text fluency and\ninformativeness. Importantly, our entire framework is applicable to black-box\nLMs and does not require external grounded knowledge. Our approach is\npractically effective and has been released as a push-button tool to benefit\nthe public, available at https://chatprotect.ai/.\n","authors":["Niels Mündler","Jingxuan He","Slobodan Jenko","Martin Vechev"],"pdf_url":"https://arxiv.org/pdf/2305.15852v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05494v2","updated":"2023-10-01T05:29:11Z","published":"2023-09-11T14:36:16Z","title":"CrisisTransformers: Pre-trained language models and sentence encoders\n for crisis-related social media texts","summary":" Social media platforms play an essential role in crisis communication, but\nanalyzing crisis-related social media texts is challenging due to their\ninformal nature. Transformer-based pre-trained models like BERT and RoBERTa\nhave shown success in various NLP tasks, but they are not tailored for\ncrisis-related texts. Furthermore, general-purpose sentence encoders are used\nto generate sentence embeddings, regardless of the textual complexities in\ncrisis-related texts. Advances in applications like text classification,\nsemantic search, and clustering contribute to effective processing of\ncrisis-related texts, which is essential for emergency responders to gain a\ncomprehensive view of a crisis event, whether historical or real-time. To\naddress these gaps in crisis informatics literature, this study introduces\nCrisisTransformers, an ensemble of pre-trained language models and sentence\nencoders trained on an extensive corpus of over 15 billion word tokens from\ntweets associated with more than 30 crisis events, including disease outbreaks,\nnatural disasters, conflicts, and other critical incidents. 
We evaluate\nexisting models and CrisisTransformers on 18 crisis-specific public datasets.\nOur pre-trained models outperform strong baselines across all datasets in\nclassification tasks, and our best-performing sentence encoder improves the\nstate-of-the-art by 17.43% in sentence encoding tasks. Additionally, we\ninvestigate the impact of model initialization on convergence and evaluate the\nsignificance of domain-specific models in generating semantically meaningful\nsentence embeddings. All models are publicly released\n(https://huggingface.co/crisistransformers), with the anticipation that they\nwill serve as a robust baseline for tasks involving the analysis of\ncrisis-related social media texts.\n","authors":["Rabindra Lamsal","Maria Rodriguez Read","Shanika Karunasekera"],"pdf_url":"https://arxiv.org/pdf/2309.05494v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01026v2","updated":"2023-10-01T02:57:42Z","published":"2023-09-02T21:29:53Z","title":"Zero-Shot Recommendations with Pre-Trained Large Language Models for\n Multimodal Nudging","summary":" We present a method for zero-shot recommendation of multimodal non-stationary\ncontent that leverages recent advancements in the field of generative AI. We\npropose rendering inputs of different modalities as textual descriptions and to\nutilize pre-trained LLMs to obtain their numerical representations by computing\nsemantic embeddings. Once unified representations of all content items are\nobtained, the recommendation can be performed by computing an appropriate\nsimilarity metric between them without any additional learning. We demonstrate\nour approach on a synthetic multimodal nudging environment, where the inputs\nconsist of tabular, textual, and visual data.\n","authors":["Rachel M. Harrison","Anton Dereventsov","Anton Bibin"],"pdf_url":"https://arxiv.org/pdf/2309.01026v2.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2308.01544v2","updated":"2023-10-01T23:24:13Z","published":"2023-08-03T05:27:12Z","title":"Multimodal Neurons in Pretrained Text-Only Transformers","summary":" Language models demonstrate remarkable capacity to generalize representations\nlearned in one modality to downstream tasks in other modalities. Can we trace\nthis ability to individual neurons? We study the case where a frozen text\ntransformer is augmented with vision using a self-supervised visual encoder and\na single linear projection learned on an image-to-text task. Outputs of the\nprojection layer are not immediately decodable into language describing image\ncontent; instead, we find that translation between modalities occurs deeper\nwithin the transformer. We introduce a procedure for identifying \"multimodal\nneurons\" that convert visual representations into corresponding text, and\ndecoding the concepts they inject into the model's residual stream. 
In a series\nof experiments, we show that multimodal neurons operate on specific visual\nconcepts across inputs, and have a systematic causal effect on image\ncaptioning.\n","authors":["Sarah Schwettmann","Neil Chowdhury","Samuel Klein","David Bau","Antonio Torralba"],"pdf_url":"https://arxiv.org/pdf/2308.01544v2.pdf","comment":"Oral presentation at ICCV CLVL 2023"},{"id":"http://arxiv.org/abs/2211.11296v2","updated":"2023-10-01T23:22:50Z","published":"2022-11-21T09:38:30Z","title":"SeeABLE: Soft Discrepancies and Bounded Contrastive Learning for\n Exposing Deepfakes","summary":" Modern deepfake detectors have achieved encouraging results, when training\nand test images are drawn from the same data collection. However, when these\ndetectors are applied to images produced with unknown deepfake-generation\ntechniques, considerable performance degradations are commonly observed. In\nthis paper, we propose a novel deepfake detector, called SeeABLE, that\nformalizes the detection problem as a (one-class) out-of-distribution detection\ntask and generalizes better to unseen deepfakes. Specifically, SeeABLE first\ngenerates local image perturbations (referred to as soft-discrepancies) and\nthen pushes the perturbed faces towards predefined prototypes using a novel\nregression-based bounded contrastive loss. To strengthen the generalization\nperformance of SeeABLE to unknown deepfake types, we generate a rich set of\nsoft discrepancies and train the detector: (i) to localize, which part of the\nface was modified, and (ii) to identify the alteration type. To demonstrate the\ncapabilities of SeeABLE, we perform rigorous experiments on several widely-used\ndeepfake datasets and show that our model convincingly outperforms competing\nstate-of-the-art detectors, while exhibiting highly encouraging generalization\ncapabilities.\n","authors":["Nicolas Larue","Ngoc-Son Vu","Vitomir Struc","Peter Peer","Vassilis Christophides"],"pdf_url":"https://arxiv.org/pdf/2211.11296v2.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2301.03213v5","updated":"2023-10-01T22:54:53Z","published":"2023-01-09T09:10:35Z","title":"EgoTracks: A Long-term Egocentric Visual Object Tracking Dataset","summary":" Visual object tracking is a key component to many egocentric vision problems.\nHowever, the full spectrum of challenges of egocentric tracking faced by an\nembodied AI is underrepresented in many existing datasets; these tend to focus\non relatively short, third-person videos. Egocentric video has several\ndistinguishing characteristics from those commonly found in past datasets:\nfrequent large camera motions and hand interactions with objects commonly lead\nto occlusions or objects exiting the frame, and object appearance can change\nrapidly due to widely different points of view, scale, or object states.\nEmbodied tracking is also naturally long-term, and being able to consistently\n(re-)associate objects to their appearances and disappearances over as long as\na lifetime is critical. Previous datasets under-emphasize this re-detection\nproblem, and their \"framed\" nature has led to adoption of various\nspatiotemporal priors that we find do not necessarily generalize to egocentric\nvideo. We thus introduce EgoTracks, a new dataset for long-term egocentric\nvisual object tracking. 
Sourced from the Ego4D dataset, this new dataset\npresents a significant challenge to recent state-of-the-art single-object\ntracking models, which we find score poorly on traditional tracking metrics for\nour new dataset, compared to popular benchmarks. We further show improvements\nthat can be made to a STARK tracker to significantly increase its performance\non egocentric data, resulting in a baseline model we call EgoSTARK. We publicly\nrelease our annotations and benchmark, hoping our dataset leads to further\nadvancements in tracking.\n","authors":["Hao Tang","Kevin Liang","Matt Feiszli","Weiyao Wang"],"pdf_url":"https://arxiv.org/pdf/2301.03213v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13301v3","updated":"2023-10-01T22:07:12Z","published":"2023-05-22T17:57:41Z","title":"Training Diffusion Models with Reinforcement Learning","summary":" Diffusion models are a class of flexible generative models trained with an\napproximation to the log-likelihood objective. However, most use cases of\ndiffusion models are not concerned with likelihoods, but instead with\ndownstream objectives such as human-perceived image quality or drug\neffectiveness. In this paper, we investigate reinforcement learning methods for\ndirectly optimizing diffusion models for such objectives. We describe how\nposing denoising as a multi-step decision-making problem enables a class of\npolicy gradient algorithms, which we refer to as denoising diffusion policy\noptimization (DDPO), that are more effective than alternative reward-weighted\nlikelihood approaches. Empirically, DDPO is able to adapt text-to-image\ndiffusion models to objectives that are difficult to express via prompting,\nsuch as image compressibility, and those derived from human feedback, such as\naesthetic quality. Finally, we show that DDPO can improve prompt-image\nalignment using feedback from a vision-language model without the need for\nadditional data collection or human annotation. The project's website can be\nfound at http://rl-diffusion.github.io .\n","authors":["Kevin Black","Michael Janner","Yilun Du","Ilya Kostrikov","Sergey Levine"],"pdf_url":"https://arxiv.org/pdf/2305.13301v3.pdf","comment":"21 pages, 14 figures"},{"id":"http://arxiv.org/abs/2209.04851v2","updated":"2023-10-01T21:31:09Z","published":"2022-09-11T12:46:01Z","title":"OpenMixup: A Comprehensive Mixup Benchmark for Visual Classification","summary":" Data mixing, or mixup, is a data-dependent augmentation technique that has\ngreatly enhanced the generalizability of modern deep neural networks. However,\na full grasp of mixup methodology necessitates a top-down hierarchical\nunderstanding from systematic impartial evaluations and empirical analysis,\nboth of which are currently lacking within the community. In this paper, we\npresent OpenMixup, the first comprehensive mixup benchmarking study for\nsupervised visual classification. OpenMixup offers a unified mixup-based model\ndesign and training framework, encompassing a wide collection of data mixing\nalgorithms, a diverse range of widely-used backbones and modules, and a set of\nmodel analysis toolkits. To ensure fair and complete comparisons, large-scale\nstandard evaluations of various mixup baselines are conducted across 12\ndiversified image datasets with meticulous confounders and tweaking powered by\nour modular and extensible codebase framework. 
Interesting observations and\ninsights are derived through detailed empirical analysis of how mixup policies,\nnetwork architectures, and dataset properties affect the mixup visual\nclassification performance. We hope that OpenMixup can bolster the\nreproducibility of previously gained insights and facilitate a better\nunderstanding of mixup properties, thereby giving the community a kick-start\nfor the development and evaluation of new mixup methods. The source code and\nuser documents are available at \\url{https://github.com/Westlake-AI/openmixup}.\n","authors":["Siyuan Li","Zedong Wang","Zicheng Liu","Di Wu","Cheng Tan","Weiyang Jin","Stan Z. Li"],"pdf_url":"https://arxiv.org/pdf/2209.04851v2.pdf","comment":"Preprint V2. The source code is available at\n https://github.com/Westlake-AI/openmixup"},{"id":"http://arxiv.org/abs/2308.13561v3","updated":"2023-10-01T20:16:22Z","published":"2023-08-24T20:42:21Z","title":"Project Aria: A New Tool for Egocentric Multi-Modal AI Research","summary":" Egocentric, multi-modal data as available on future augmented reality (AR)\ndevices provides unique challenges and opportunities for machine perception.\nThese future devices will need to be all-day wearable in a socially acceptable\nform-factor to support always available, context-aware and personalized AI\napplications. Our team at Meta Reality Labs Research built the Aria device, an\negocentric, multi-modal data recording and streaming device with the goal to\nfoster and accelerate research in this area. In this paper, we describe the\nAria device hardware including its sensor configuration and the corresponding\nsoftware tools that enable recording and processing of such data.\n","authors":["Jakob Engel","Kiran Somasundaram","Michael Goesele","Albert Sun","Alexander Gamino","Andrew Turner","Arjang Talattof","Arnie Yuan","Bilal Souti","Brighid Meredith","Cheng Peng","Chris Sweeney","Cole Wilson","Dan Barnes","Daniel DeTone","David Caruso","Derek Valleroy","Dinesh Ginjupalli","Duncan Frost","Edward Miller","Elias Mueggler","Evgeniy Oleinik","Fan Zhang","Guruprasad Somasundaram","Gustavo Solaira","Harry Lanaras","Henry Howard-Jenkins","Huixuan Tang","Hyo Jin Kim","Jaime Rivera","Ji Luo","Jing Dong","Julian Straub","Kevin Bailey","Kevin Eckenhoff","Lingni Ma","Luis Pesqueira","Mark Schwesinger","Maurizio Monge","Nan Yang","Nick Charron","Nikhil Raina","Omkar Parkhi","Peter Borschowa","Pierre Moulon","Prince Gupta","Raul Mur-Artal","Robbie Pennington","Sachin Kulkarni","Sagar Miglani","Santosh Gondi","Saransh Solanki","Sean Diener","Shangyi Cheng","Simon Green","Steve Saarinen","Suvam Patra","Tassos Mourikis","Thomas Whelan","Tripti Singh","Vasileios Balntas","Vijay Baiyya","Wilson Dreewes","Xiaqing Pan","Yang Lou","Yipu Zhao","Yusuf Mansour","Yuyang Zou","Zhaoyang Lv","Zijian Wang","Mingfei Yan","Carl Ren","Renzo De Nardi","Richard Newcombe"],"pdf_url":"https://arxiv.org/pdf/2308.13561v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2109.09658v4","updated":"2023-10-01T19:19:03Z","published":"2021-09-20T16:22:49Z","title":"FUTURE-AI: Guiding Principles and Consensus Recommendations for\n Trustworthy Artificial Intelligence in Medical Imaging","summary":" The recent advancements in artificial intelligence (AI) combined with the\nextensive amount of data generated by today's clinical systems, has led to the\ndevelopment of imaging AI solutions across the whole value chain of medical\nimaging, including image reconstruction, medical image segmentation,\nimage-based diagnosis and treatment planning. 
Notwithstanding the successes and\nfuture potential of AI in medical imaging, many stakeholders are concerned of\nthe potential risks and ethical implications of imaging AI solutions, which are\nperceived as complex, opaque, and difficult to comprehend, utilise, and trust\nin critical clinical applications. Despite these concerns and risks, there are\ncurrently no concrete guidelines and best practices for guiding future AI\ndevelopments in medical imaging towards increased trust, safety and adoption.\nTo bridge this gap, this paper introduces a careful selection of guiding\nprinciples drawn from the accumulated experiences, consensus, and best\npractices from five large European projects on AI in Health Imaging. These\nguiding principles are named FUTURE-AI and its building blocks consist of (i)\nFairness, (ii) Universality, (iii) Traceability, (iv) Usability, (v) Robustness\nand (vi) Explainability. In a step-by-step approach, these guidelines are\nfurther translated into a framework of concrete recommendations for specifying,\ndeveloping, evaluating, and deploying technically, clinically and ethically\ntrustworthy AI solutions into clinical practice.\n","authors":["Karim Lekadir","Richard Osuala","Catherine Gallin","Noussair Lazrak","Kaisar Kushibar","Gianna Tsakou","Susanna Aussó","Leonor Cerdá Alberich","Kostas Marias","Manolis Tsiknakis","Sara Colantonio","Nickolas Papanikolaou","Zohaib Salahuddin","Henry C Woodruff","Philippe Lambin","Luis Martí-Bonmatí"],"pdf_url":"https://arxiv.org/pdf/2109.09658v4.pdf","comment":"Please refer to arXiv:2309.12325 for the latest FUTURE-AI framework\n for healthcare"},{"id":"http://arxiv.org/abs/2305.15700v4","updated":"2023-10-01T19:03:09Z","published":"2023-05-25T04:16:07Z","title":"Fairness Continual Learning Approach to Semantic Scene Understanding in\n Open-World Environments","summary":" Continual semantic segmentation aims to learn new classes while maintaining\nthe information from the previous classes. Although prior studies have shown\nimpressive progress in recent years, the fairness concern in the continual\nsemantic segmentation needs to be better addressed. Meanwhile, fairness is one\nof the most vital factors in deploying the deep learning model, especially in\nhuman-related or safety applications. In this paper, we present a novel\nFairness Continual Learning approach to the semantic segmentation problem. In\nparticular, under the fairness objective, a new fairness continual learning\nframework is proposed based on class distributions. Then, a novel Prototypical\nContrastive Clustering loss is proposed to address the significant challenges\nin continual learning, i.e., catastrophic forgetting and background shift. Our\nproposed loss has also been proven as a novel, generalized learning paradigm of\nknowledge distillation commonly used in continual learning. Moreover, the\nproposed Conditional Structural Consistency loss further regularized the\nstructural constraint of the predicted segmentation. 
Our proposed approach has\nachieved State-of-the-Art performance on three standard scene understanding\nbenchmarks, i.e., ADE20K, Cityscapes, and Pascal VOC, and promoted the fairness\nof the segmentation model.\n","authors":["Thanh-Dat Truong","Hoang-Quan Nguyen","Bhiksha Raj","Khoa Luu"],"pdf_url":"https://arxiv.org/pdf/2305.15700v4.pdf","comment":"Accepted to NeurIPS 2023"},{"id":"http://arxiv.org/abs/2302.05361v3","updated":"2023-10-01T17:09:16Z","published":"2023-02-10T16:21:07Z","title":"Leveraging Inpainting for Single-Image Shadow Removal","summary":" Fully-supervised shadow removal methods achieve the best restoration\nqualities on public datasets but still generate some shadow remnants. One of\nthe reasons is the lack of large-scale shadow & shadow-free image pairs.\nUnsupervised methods can alleviate the issue but their restoration qualities\nare much lower than those of fully-supervised methods. In this work, we find\nthat pretraining shadow removal networks on the image inpainting dataset can\nreduce the shadow remnants significantly: a naive encoder-decoder network gets\ncompetitive restoration quality w.r.t. the state-of-the-art methods via only\n10% shadow & shadow-free image pairs. After analyzing networks with/without\ninpainting pre-training via the information stored in the weight (IIW), we find\nthat inpainting pretraining improves restoration quality in non-shadow regions\nand enhances the generalization ability of networks significantly.\nAdditionally, shadow removal fine-tuning enables networks to fill in the\ndetails of shadow regions. Inspired by these observations we formulate shadow\nremoval as an adaptive fusion task that takes advantage of both shadow removal\nand image inpainting. Specifically, we develop an adaptive fusion network\nconsisting of two encoders, an adaptive fusion block, and a decoder. The two\nencoders are responsible for extracting the feature from the shadow image and\nthe shadow-masked image respectively. The adaptive fusion block is responsible\nfor combining these features in an adaptive manner. Finally, the decoder\nconverts the adaptive fused features to the desired shadow-free result. The\nextensive experiments show that our method empowered with inpainting\noutperforms all state-of-the-art methods.\n","authors":["Xiaoguang Li","Qing Guo","Rabab Abdelfattah","Di Lin","Wei Feng","Ivor Tsang","Song Wang"],"pdf_url":"https://arxiv.org/pdf/2302.05361v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06842v2","updated":"2023-10-01T17:04:35Z","published":"2023-06-12T03:28:18Z","title":"AerialFormer: Multi-resolution Transformer for Aerial Image Segmentation","summary":" Aerial Image Segmentation is a top-down perspective semantic segmentation and\nhas several challenging characteristics such as strong imbalance in the\nforeground-background distribution, complex background, intra-class\nheterogeneity, inter-class homogeneity, and tiny objects. To handle these\nproblems, we inherit the advantages of Transformers and propose AerialFormer,\nwhich unifies Transformers at the contracting path with lightweight\nMulti-Dilated Convolutional Neural Networks (MD-CNNs) at the expanding path.\nOur AerialFormer is designed as a hierarchical structure, in which Transformer\nencoder outputs multi-scale features and MD-CNNs decoder aggregates information\nfrom the multi-scales. Thus, it takes both local and global contexts into\nconsideration to render powerful representations and high-resolution\nsegmentation. 
We have benchmarked AerialFormer on three common datasets\nincluding iSAID, LoveDA, and Potsdam. Comprehensive experiments and extensive\nablation studies show that our proposed AerialFormer outperforms previous\nstate-of-the-art methods with remarkable performance. Our source code will be\npublicly available upon acceptance.\n","authors":["Kashu Yamazaki","Taisei Hanyu","Minh Tran","Adrian de Luis","Roy McCann","Haitao Liao","Chase Rainwater","Meredith Adkins","Jackson Cothren","Ngan Le"],"pdf_url":"https://arxiv.org/pdf/2306.06842v2.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2301.02615v2","updated":"2023-10-01T16:32:23Z","published":"2023-01-05T15:11:05Z","title":"Silent Killer: A Stealthy, Clean-Label, Black-Box Backdoor Attack","summary":" Backdoor poisoning attacks pose a well-known risk to neural networks.\nHowever, most studies have focused on lenient threat models. We introduce\nSilent Killer, a novel attack that operates in clean-label, black-box settings,\nuses a stealthy poison and trigger and outperforms existing methods. We\ninvestigate the use of universal adversarial perturbations as triggers in\nclean-label attacks, following the success of such approaches under\npoison-label settings. We analyze the success of a naive adaptation and find\nthat gradient alignment for crafting the poison is required to ensure high\nsuccess rates. We conduct thorough experiments on MNIST, CIFAR10, and a reduced\nversion of ImageNet and achieve state-of-the-art results.\n","authors":["Tzvi Lederer","Gallil Maimon","Lior Rokach"],"pdf_url":"https://arxiv.org/pdf/2301.02615v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.02419v2","updated":"2023-10-01T15:23:02Z","published":"2023-04-05T12:58:33Z","title":"TM2D: Bimodality Driven 3D Dance Generation via Music-Text Integration","summary":" We propose a novel task for generating 3D dance movements that simultaneously\nincorporate both text and music modalities. Unlike existing works that generate\ndance movements using a single modality such as music, our goal is to produce\nricher dance movements guided by the instructive information provided by the\ntext. However, the lack of paired motion data with both music and text\nmodalities limits the ability to generate dance movements that integrate both.\nTo alleviate this challenge, we propose to utilize a 3D human motion VQ-VAE to\nproject the motions of the two datasets into a latent space consisting of\nquantized vectors, which effectively mix the motion tokens from the two\ndatasets with different distributions for training. Additionally, we propose a\ncross-modal transformer to integrate text instructions into motion generation\narchitecture for generating 3D dance movements without degrading the\nperformance of music-conditioned dance generation. To better evaluate the\nquality of the generated motion, we introduce two novel metrics, namely Motion\nPrediction Distance (MPD) and Freezing Score (FS), to measure the coherence and\nfreezing percentage of the generated motion. Extensive experiments show that\nour approach can generate realistic and coherent dance movements conditioned on\nboth text and music while maintaining comparable performance with the two\nsingle modalities. 
Code is available at https://garfield-kh.github.io/TM2D/.\n","authors":["Kehong Gong","Dongze Lian","Heng Chang","Chuan Guo","Zihang Jiang","Xinxin Zuo","Michael Bi Mi","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2304.02419v2.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2306.08243v3","updated":"2023-10-01T15:20:24Z","published":"2023-06-14T05:04:11Z","title":"MMASD: A Multimodal Dataset for Autism Intervention Analysis","summary":" Autism spectrum disorder (ASD) is a developmental disorder characterized by\nsignificant social communication impairments and difficulties perceiving and\npresenting communication cues. Machine learning techniques have been broadly\nadopted to facilitate autism studies and assessments. However, computational\nmodels are primarily concentrated on specific analysis and validated on private\ndatasets in the autism community, which limits comparisons across models due to\nprivacy-preserving data sharing complications. This work presents a novel\nprivacy-preserving open-source dataset, MMASD as a MultiModal ASD benchmark\ndataset, collected from play therapy interventions of children with Autism.\nMMASD includes data from 32 children with ASD, and 1,315 data samples segmented\nfrom over 100 hours of intervention recordings. To promote public access, each\ndata sample consists of four privacy-preserving modalities of data; some of\nwhich are derived from original videos: (1) optical flow, (2) 2D skeleton, (3)\n3D skeleton, and (4) clinician ASD evaluation scores of children, e.g., ADOS\nscores. MMASD aims to assist researchers and therapists in understanding\nchildren's cognitive status, monitoring their progress during therapy, and\ncustomizing the treatment plan accordingly. It also has inspiration for\ndownstream tasks such as action quality assessment and interpersonal synchrony\nestimation. MMASD dataset can be easily accessed at\nhttps://github.com/Li-Jicheng/MMASD-A-Multimodal-Dataset-for-Autism-Intervention-Analysis.\n","authors":["Jicheng Li","Vuthea Chheang","Pinar Kullu","Eli Brignac","Zhang Guo","Kenneth E. Barner","Anjana Bhat","Roghayeh Leila Barmaki"],"pdf_url":"https://arxiv.org/pdf/2306.08243v3.pdf","comment":"8 pages, 2 figures"},{"id":"http://arxiv.org/abs/2306.16605v3","updated":"2023-10-01T14:56:37Z","published":"2023-06-29T00:12:21Z","title":"KITE: Keypoint-Conditioned Policies for Semantic Manipulation","summary":" While natural language offers a convenient shared interface for humans and\nrobots, enabling robots to interpret and follow language commands remains a\nlongstanding challenge in manipulation. A crucial step to realizing a\nperformant instruction-following robot is achieving semantic manipulation,\nwhere a robot interprets language at different specificities, from high-level\ninstructions like \"Pick up the stuffed animal\" to more detailed inputs like\n\"Grab the left ear of the elephant.\" To tackle this, we propose Keypoints +\nInstructions to Execution (KITE), a two-step framework for semantic\nmanipulation which attends to both scene semantics (distinguishing between\ndifferent objects in a visual scene) and object semantics (precisely localizing\ndifferent parts within an object instance). KITE first grounds an input\ninstruction in a visual scene through 2D image keypoints, providing a highly\naccurate object-centric bias for downstream action inference. Provided an RGB-D\nscene observation, KITE then executes a learned keypoint-conditioned skill to\ncarry out the instruction. 
The combined precision of keypoints and\nparameterized skills enables fine-grained manipulation with generalization to\nscene and object variations. Empirically, we demonstrate KITE in 3 real-world\nenvironments: long-horizon 6-DoF tabletop manipulation, semantic grasping, and\na high-precision coffee-making task. In these settings, KITE achieves a 75%,\n70%, and 71% overall success rate for instruction-following, respectively. KITE\noutperforms frameworks that opt for pre-trained visual language models over\nkeypoint-based grounding, or omit skills in favor of end-to-end visuomotor\ncontrol, all while being trained from fewer or comparable amounts of\ndemonstrations. Supplementary material, datasets, code, and videos can be found\non our website: http://tinyurl.com/kite-site.\n","authors":["Priya Sundaresan","Suneel Belkhale","Dorsa Sadigh","Jeannette Bohg"],"pdf_url":"https://arxiv.org/pdf/2306.16605v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.15048v2","updated":"2023-10-01T14:50:31Z","published":"2023-09-26T16:25:57Z","title":"Class Incremental Learning via Likelihood Ratio Based Task Prediction","summary":" Class incremental learning (CIL) is a challenging setting of continual\nlearning, which learns a series of tasks sequentially. Each task consists of a\nset of unique classes. The key feature of CIL is that no task identifier (or\ntask-id) is provided at test time for each test sample. Predicting the task-id\nfor each test sample is a challenging problem. An emerging theoretically\njustified and effective approach is to train a task-specific model for each\ntask in a shared network for all tasks based on a task-incremental learning\n(TIL) method to deal with forgetting. The model for each task in this approach\nis an out-of-distribution (OOD) detector rather than a conventional classifier.\nThe OOD detector can perform both within-task (in-distribution (IND)) class\nprediction and OOD detection. The OOD detection capability is the key for\ntask-id prediction during inference for each test sample. However, this paper\nargues that using a traditional OOD detector for task-id prediction is\nsub-optimal because additional information (e.g., the replay data and the\nlearned tasks) available in CIL can be exploited to design a better and\nprincipled method for task-id prediction. We call the new method TPLR (Task-id\nPrediction based on Likelihood Ratio}). TPLR markedly outperforms strong CIL\nbaselines.\n","authors":["Haowei Lin","Yijia Shao","Weinan Qian","Ningxin Pan","Yiduo Guo","Bing Liu"],"pdf_url":"https://arxiv.org/pdf/2309.15048v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.13196v2","updated":"2023-10-01T14:02:33Z","published":"2023-09-22T22:12:30Z","title":"ClusterFormer: Clustering As A Universal Visual Learner","summary":" This paper presents CLUSTERFORMER, a universal vision model that is based on\nthe CLUSTERing paradigm with TransFORMER. It comprises two novel designs: 1.\nrecurrent cross-attention clustering, which reformulates the cross-attention\nmechanism in Transformer and enables recursive updates of cluster centers to\nfacilitate strong representation learning; and 2. feature dispatching, which\nuses the updated cluster centers to redistribute image features through\nsimilarity-based metrics, resulting in a transparent pipeline. 
This elegant\ndesign streamlines an explainable and transferable workflow, capable of\ntackling heterogeneous vision tasks (i.e., image classification, object\ndetection, and image segmentation) with varying levels of clustering\ngranularity (i.e., image-, box-, and pixel-level). Empirical results\ndemonstrate that CLUSTERFORMER outperforms various well-known specialized\narchitectures, achieving 83.41% top-1 acc. over ImageNet-1K for image\nclassification, 54.2% and 47.0% mAP over MS COCO for object detection and\ninstance segmentation, 52.4% mIoU over ADE20K for semantic segmentation, and\n55.8% PQ over COCO Panoptic for panoptic segmentation. For its efficacy, we\nhope our work can catalyze a paradigm shift in universal models in computer\nvision.\n","authors":["James C. Liang","Yiming Cui","Qifan Wang","Tong Geng","Wenguan Wang","Dongfang Liu"],"pdf_url":"https://arxiv.org/pdf/2309.13196v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.06774v3","updated":"2023-10-01T13:59:25Z","published":"2022-11-13T00:09:36Z","title":"Large-Scale Bidirectional Training for Zero-Shot Image Captioning","summary":" When trained on large-scale datasets, image captioning models can understand\nthe content of images from a general domain but often fail to generate\naccurate, detailed captions. To improve performance, pretraining-and-finetuning\nhas been a key strategy for image captioning. However, we find that large-scale\nbidirectional training between image and text enables zero-shot image\ncaptioning. In this paper, we introduce Bidirectional Image Text Training in\nlargER Scale, BITTERS, an efficient training and inference framework for\nzero-shot image captioning. We also propose a new evaluation benchmark which\ncomprises of high quality datasets and an extensive set of metrics to properly\nevaluate zero-shot captioning accuracy and societal bias. We additionally\nprovide an efficient finetuning approach for keyword extraction. We show that\ncareful selection of large-scale training set and model architecture is the key\nto achieving zero-shot image captioning.\n","authors":["Taehoon Kim","Mark Marsden","Pyunghwan Ahn","Sangyun Kim","Sihaeng Lee","Alessandra Sala","Seung Hwan Kim"],"pdf_url":"https://arxiv.org/pdf/2211.06774v3.pdf","comment":"Arxiv Preprint. Work in progress"},{"id":"http://arxiv.org/abs/2309.03160v2","updated":"2023-10-01T13:16:36Z","published":"2023-09-06T16:59:36Z","title":"ResFields: Residual Neural Fields for Spatiotemporal Signals","summary":" Neural fields, a category of neural networks trained to represent\nhigh-frequency signals, have gained significant attention in recent years due\nto their impressive performance in modeling complex 3D data, especially large\nneural signed distance (SDFs) or radiance fields (NeRFs) via a single\nmulti-layer perceptron (MLP). However, despite the power and simplicity of\nrepresenting signals with an MLP, these methods still face challenges when\nmodeling large and complex temporal signals due to the limited capacity of\nMLPs. In this paper, we propose an effective approach to address this\nlimitation by incorporating temporal residual layers into neural fields, dubbed\nResFields, a novel class of networks specifically designed to effectively\nrepresent complex temporal signals. 
We conduct a comprehensive analysis of the\nproperties of ResFields and propose a matrix factorization technique to reduce\nthe number of trainable parameters and enhance generalization capabilities.\nImportantly, our formulation seamlessly integrates with existing techniques and\nconsistently improves results across various challenging tasks: 2D video\napproximation, dynamic shape modeling via temporal SDFs, and dynamic NeRF\nreconstruction. Lastly, we demonstrate the practical utility of ResFields by\nshowcasing its effectiveness in capturing dynamic 3D scenes from sparse sensory\ninputs of a lightweight capture system.\n","authors":["Marko Mihajlovic","Sergey Prokudin","Marc Pollefeys","Siyu Tang"],"pdf_url":"https://arxiv.org/pdf/2309.03160v2.pdf","comment":"Project page and code at https://markomih.github.io/ResFields/"},{"id":"http://arxiv.org/abs/2307.03980v2","updated":"2023-10-01T12:57:01Z","published":"2023-07-08T14:08:37Z","title":"Building and Road Segmentation Using EffUNet and Transfer Learning\n Approach","summary":" In city, information about urban objects such as water supply, railway lines,\npower lines, buildings, roads, etc., is necessary for city planning. In\nparticular, information about the spread of these objects, locations and\ncapacity is needed for the policymakers to make impactful decisions. This\nthesis aims to segment the building and roads from the aerial image captured by\nthe satellites and UAVs. Many different architectures have been proposed for\nthe semantic segmentation task and UNet being one of them. In this thesis, we\npropose a novel architecture based on Google's newly proposed EfficientNetV2 as\nan encoder for feature extraction with UNet decoder for constructing the\nsegmentation map. Using this approach we achieved a benchmark score for the\nMassachusetts Building and Road dataset with an mIOU of 0.8365 and 0.9153\nrespectively.\n","authors":["Sahil Gangurde"],"pdf_url":"https://arxiv.org/pdf/2307.03980v2.pdf","comment":"The transformer network analysis was not included in the current\n paper"},{"id":"http://arxiv.org/abs/2306.15203v2","updated":"2023-10-01T11:57:40Z","published":"2023-06-27T04:50:58Z","title":"Unsupervised Polychromatic Neural Representation for CT Metal Artifact\n Reduction","summary":" Emerging neural reconstruction techniques based on tomography (e.g., NeRF,\nNeAT, and NeRP) have started showing unique capabilities in medical imaging. In\nthis work, we present a novel Polychromatic neural representation (Polyner) to\ntackle the challenging problem of CT imaging when metallic implants exist\nwithin the human body. CT metal artifacts arise from the drastic variation of\nmetal's attenuation coefficients at various energy levels of the X-ray\nspectrum, leading to a nonlinear metal effect in CT measurements. Recovering CT\nimages from metal-affected measurements hence poses a complicated nonlinear\ninverse problem where empirical models adopted in previous metal artifact\nreduction (MAR) approaches lead to signal loss and strongly aliased\nreconstructions. Polyner instead models the MAR problem from a nonlinear\ninverse problem perspective. Specifically, we first derive a polychromatic\nforward model to accurately simulate the nonlinear CT acquisition process.\nThen, we incorporate our forward model into the implicit neural representation\nto accomplish reconstruction. 
Lastly, we adopt a regularizer to preserve the\nphysical properties of the CT images across different energy levels while\neffectively constraining the solution space. Our Polyner is an unsupervised\nmethod and does not require any external training data. Experimenting with\nmultiple datasets shows that our Polyner achieves comparable or better\nperformance than supervised methods on in-domain datasets while demonstrating\nsignificant performance improvements on out-of-domain datasets. To the best of\nour knowledge, our Polyner is the first unsupervised MAR method that\noutperforms its supervised counterparts. The code for this work is available\nat: https://github.com/iwuqing/Polyner.\n","authors":["Qing Wu","Lixuan Chen","Ce Wang","Hongjiang Wei","S. Kevin Zhou","Jingyi Yu","Yuyao Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.15203v2.pdf","comment":"Accepted by NeurIPS 2023"},{"id":"http://arxiv.org/abs/2303.17550v4","updated":"2023-10-01T11:20:26Z","published":"2023-03-30T17:18:31Z","title":"DAE-Talker: High Fidelity Speech-Driven Talking Face Generation with\n Diffusion Autoencoder","summary":" While recent research has made significant progress in speech-driven talking\nface generation, the quality of the generated video still lags behind that of\nreal recordings. One reason for this is the use of handcrafted intermediate\nrepresentations like facial landmarks and 3DMM coefficients, which are designed\nbased on human knowledge and are insufficient to precisely describe facial\nmovements. Additionally, these methods require an external pretrained model for\nextracting these representations, whose performance sets an upper bound on\ntalking face generation. To address these limitations, we propose a novel\nmethod called DAE-Talker that leverages data-driven latent representations\nobtained from a diffusion autoencoder (DAE). DAE contains an image encoder that\nencodes an image into a latent vector and a DDIM image decoder that\nreconstructs the image from it. We train our DAE on talking face video frames\nand then extract their latent representations as the training target for a\nConformer-based speech2latent model. This allows DAE-Talker to synthesize full\nvideo frames and produce natural head movements that align with the content of\nspeech, rather than relying on a predetermined head pose from a template video.\nWe also introduce pose modelling in speech2latent for pose controllability.\nAdditionally, we propose a novel method for generating continuous video frames\nwith the DDIM image decoder trained on individual frames, eliminating the need\nfor modelling the joint distribution of consecutive frames directly. Our\nexperiments show that DAE-Talker outperforms existing popular methods in\nlip-sync, video fidelity, and pose naturalness. We also conduct ablation\nstudies to analyze the effectiveness of the proposed techniques and demonstrate\nthe pose controllability of DAE-Talker.\n","authors":["Chenpeng Du","Qi Chen","Tianyu He","Xu Tan","Xie Chen","Kai Yu","Sheng Zhao","Jiang Bian"],"pdf_url":"https://arxiv.org/pdf/2303.17550v4.pdf","comment":"Accepted to ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2308.05320v2","updated":"2023-10-01T09:14:51Z","published":"2023-08-10T03:44:10Z","title":"Generating Transferable and Stealthy Adversarial Patch via\n Attention-guided Adversarial Inpainting","summary":" Adversarial patch attacks can fool the face recognition (FR) models via small\npatches. 
However, previous adversarial patch attacks often result in unnatural\npatterns that are easily noticeable. Generating transferable and stealthy\nadversarial patches that can efficiently deceive the black-box FR models while\nhaving good camouflage is challenging because of the huge stylistic difference\nbetween the source and target images. To generate transferable,\nnatural-looking, and stealthy adversarial patches, we propose an innovative\ntwo-stage attack called Adv-Inpainting, which extracts style features and\nidentity features from the attacker and target faces, respectively and then\nfills the patches with misleading and inconspicuous content guided by attention\nmaps. In the first stage, we extract multi-scale style embeddings by a\npyramid-like network and identity embeddings by a pretrained FR model and\npropose a novel Attention-guided Adaptive Instance Normalization layer (AAIN)\nto merge them via background-patch cross-attention maps. The proposed layer can\nadaptively fuse identity and style embeddings by fully exploiting priority\ncontextual information. In the second stage, we design an Adversarial Patch\nRefinement Network (APR-Net) with a novel boundary variance loss, a spatial\ndiscounted reconstruction loss, and a perceptual loss to boost the stealthiness\nfurther. Experiments demonstrate that our attack can generate adversarial\npatches with improved visual quality, better stealthiness, and stronger\ntransferability than state-of-the-art adversarial patch attacks and semantic\nattacks.\n","authors":["Yanjie Li","Mingxing Duan","Xuelong Dai","Bin Xiao"],"pdf_url":"https://arxiv.org/pdf/2308.05320v2.pdf","comment":"Submitted to ICLR2024"},{"id":"http://arxiv.org/abs/2302.06857v2","updated":"2023-10-01T08:08:02Z","published":"2023-02-14T06:28:42Z","title":"Make Your Brief Stroke Real and Stereoscopic: 3D-Aware Simplified Sketch\n to Portrait Generation","summary":" Creating the photo-realistic version of people sketched portraits is useful\nto various entertainment purposes. Existing studies only generate portraits in\nthe 2D plane with fixed views, making the results less vivid. In this paper, we\npresent Stereoscopic Simplified Sketch-to-Portrait (SSSP), which explores the\npossibility of creating Stereoscopic 3D-aware portraits from simple contour\nsketches by involving 3D generative models. Our key insight is to design\nsketch-aware constraints that can fully exploit the prior knowledge of a\ntri-plane-based 3D-aware generative model. Specifically, our designed\nregion-aware volume rendering strategy and global consistency constraint\nfurther enhance detail correspondences during sketch encoding. Moreover, in\norder to facilitate the usage of layman users, we propose a Contour-to-Sketch\nmodule with vector quantized representations, so that easily drawn contours can\ndirectly guide the generation of 3D portraits. Extensive comparisons show that\nour method generates high-quality results that match the sketch. 
Our usability\nstudy verifies that our system is greatly preferred by users.\n","authors":["Yasheng Sun","Qianyi Wu","Hang Zhou","Kaisiyuan Wang","Tianshu Hu","Chen-Chieh Liao","Shio Miyafuji","Ziwei Liu","Hideki Koike"],"pdf_url":"https://arxiv.org/pdf/2302.06857v2.pdf","comment":"Project Page on https://hangz-nju-cuhk.github.io/projects/SSSP, Video\n Url: https://youtu.be/GiOKbvr2U_E"},{"id":"http://arxiv.org/abs/2211.11242v2","updated":"2023-10-01T07:25:31Z","published":"2022-11-21T08:15:18Z","title":"L-MAE: Masked Autoencoders are Semantic Segmentation Datasets Augmenter","summary":" Generating semantic segmentation datasets has consistently been laborious and\ntime-consuming, particularly in the context of large models or specialized\ndomains (i.e. Medical Imaging or Remote Sensing). Specifically, large models\nnecessitate a substantial volume of data, while datasets in professional\ndomains frequently require the involvement of domain experts. Both scenarios\nare susceptible to inaccurate data labeling, which can significantly affect the\nultimate performance of the trained model. This paper proposes a simple and\neffective label pixel-level completion method, \\textbf{Label Mask AutoEncoder}\n(L-MAE), which fully uses the existing information in the label to generate the\ncomplete label. The proposed model is the first to apply the Mask Auto-Encoder\nto downstream tasks. In detail, L-MAE adopts the fusion strategy that stacks\nthe label and the corresponding image, namely fuse map. Moreover, since some of\nthe image information is lost when masking the fuse map, direct reconstruction\nmay lead to poor performance. We proposed an Image Patch Supplement algorithm to\nsupplement the missing information during the mask-reconstruct process, and\nempirically found that an average of 4.1\\% mIoU can be improved.\n We conducted an experiment to evaluate the efficacy of L-MAE to complete the\ndataset. We employed a degraded Pascal VOC dataset and the degraded dataset\nenhanced by L-MAE to train an identical conventional semantic segmentation\nmodel for the initial set of experiments. The results of these experiments\ndemonstrate a performance enhancement of 13.5\\% in the model trained with the\nL-MAE-enhanced dataset compared to the unenhanced dataset.\n","authors":["Jiaru Jia","Mingzhe Liu","Jiake Xie","Xin Chen","Hong Zhang","Feixiang Zhao","Aiqing Yang"],"pdf_url":"https://arxiv.org/pdf/2211.11242v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.15179v3","updated":"2023-10-01T05:06:56Z","published":"2022-09-30T01:59:53Z","title":"Physical Adversarial Attack meets Computer Vision: A Decade Survey","summary":" Despite the impressive achievements of Deep Neural Networks (DNNs) in\ncomputer vision, their vulnerability to adversarial attacks remains a critical\nconcern. Extensive research has demonstrated that incorporating sophisticated\nperturbations into input images can lead to a catastrophic degradation in DNNs'\nperformance. This perplexing phenomenon not only exists in the digital space\nbut also in the physical world. Consequently, it becomes imperative to evaluate\nthe security of DNNs-based systems to ensure their safe deployment in\nreal-world scenarios, particularly in security-sensitive applications. To\nfacilitate a profound understanding of this topic, this paper presents a\ncomprehensive overview of physical adversarial attacks. Firstly, we distill\nfour general steps for launching physical adversarial attacks. 
Building upon\nthis foundation, we uncover the pervasive role of artifacts carrying\nadversarial perturbations in the physical world. These artifacts influence each\nstep. To denote them, we introduce a new term: adversarial medium. Then, we\ntake the first step to systematically evaluate the performance of physical\nadversarial attacks, taking the adversarial medium as a first attempt. Our\nproposed evaluation metric, hiPAA, comprises six perspectives: Effectiveness,\nStealthiness, Robustness, Practicability, Aesthetics, and Economics. We also\nprovide comparative results across task categories, together with insightful\nobservations and suggestions for future research directions.\n","authors":["Hui Wei","Hao Tang","Xuemei Jia","Zhixiang Wang","Hanxun Yu","Zhubo Li","Shin'ichi Satoh","Luc Van Gool","Zheng Wang"],"pdf_url":"https://arxiv.org/pdf/2209.15179v3.pdf","comment":"19 pages. Under Review"},{"id":"http://arxiv.org/abs/2309.13549v2","updated":"2023-10-01T04:01:04Z","published":"2023-09-24T04:43:39Z","title":"Towards Robust Robot 3D Perception in Urban Environments: The UT Campus\n Object Dataset","summary":" We introduce the UT Campus Object Dataset (CODa), a mobile robot egocentric\nperception dataset collected on the University of Texas Austin Campus. Our\ndataset contains 8.5 hours of multimodal sensor data: synchronized 3D point\nclouds and stereo RGB video from a 128-channel 3D LiDAR and two 1.25MP RGB\ncameras at 10 fps; RGB-D videos from an additional 0.5MP sensor at 7 fps, and a\n9-DOF IMU sensor at 40 Hz. We provide 58 minutes of ground-truth annotations\ncontaining 1.3 million 3D bounding boxes with instance IDs for 53 semantic\nclasses, 5000 frames of 3D semantic annotations for urban terrain, and\npseudo-ground truth localization. We repeatedly traverse identical geographic\nlocations for a wide range of indoor and outdoor areas, weather conditions, and\ntimes of the day. Using CODa, we empirically demonstrate that: 1) 3D object\ndetection performance in urban settings is significantly higher when trained\nusing CODa compared to existing datasets even when employing state-of-the-art\ndomain adaptation approaches, 2) sensor-specific fine-tuning improves 3D object\ndetection accuracy and 3) pretraining on CODa improves cross-dataset 3D object\ndetection performance in urban settings compared to pretraining on AV datasets.\nUsing our dataset and annotations, we release benchmarks for 3D object\ndetection and 3D semantic segmentation using established metrics. In the\nfuture, the CODa benchmark will include additional tasks like unsupervised\nobject discovery and re-identification. 
We publicly release CODa on the Texas\nData Repository, pre-trained models, dataset development package, and\ninteractive dataset viewer on our website at https://amrl.cs.utexas.edu/coda.\nWe expect CODa to be a valuable dataset for research in egocentric 3D\nperception and planning for autonomous navigation in urban environments.\n","authors":["Arthur Zhang","Chaitanya Eranki","Christina Zhang","Ji-Hwan Park","Raymond Hong","Pranav Kalyani","Lochana Kalyanaraman","Arsh Gamare","Arnav Bagad","Maria Esteva","Joydeep Biswas"],"pdf_url":"https://arxiv.org/pdf/2309.13549v2.pdf","comment":"19 pages, 18 figures, 12 tables"},{"id":"http://arxiv.org/abs/2305.03815v2","updated":"2023-10-01T03:13:26Z","published":"2023-05-05T19:42:39Z","title":"Persistent Homology Meets Object Unity: Object Recognition in Clutter","summary":" Recognition of occluded objects in unseen and unstructured indoor\nenvironments is a challenging problem for mobile robots. To address this\nchallenge, we propose a new descriptor, TOPS, for point clouds generated from\ndepth images and an accompanying recognition framework, THOR, inspired by human\nreasoning. The descriptor employs a novel slicing-based approach to compute\ntopological features from filtrations of simplicial complexes using persistent\nhomology, and facilitates reasoning-based recognition using object unity. Apart\nfrom a benchmark dataset, we report performance on a new dataset, the UW Indoor\nScenes (UW-IS) Occluded dataset, curated using commodity hardware to reflect\nreal-world scenarios with different environmental conditions and degrees of\nobject occlusion. THOR outperforms state-of-the-art methods on both the\ndatasets and achieves substantially higher recognition accuracy for all the\nscenarios of the UW-IS Occluded dataset. Therefore, THOR, is a promising step\ntoward robust recognition in low-cost robots, meant for everyday use in indoor\nsettings.\n","authors":["Ekta U. Samani","Ashis G. Banerjee"],"pdf_url":"https://arxiv.org/pdf/2305.03815v2.pdf","comment":"Conditionally accepted for publication in the IEEE Transactions on\n Robotics"},{"id":"http://arxiv.org/abs/2302.11464v3","updated":"2023-10-01T02:54:07Z","published":"2023-02-22T15:57:03Z","title":"Debiased Mapping for Full-Reference Image Quality Assessment","summary":" Mapping images to deep feature space for comparisons has been wildly adopted\nin recent learning-based full-reference image quality assessment (FR-IQA)\nmodels. Analogous to the classical classification task, the ideal mapping space\nfor quality regression should possess both inter-class separability and\nintra-class compactness. The inter-class separability that focuses on the\ndiscrimination of images with different quality levels has been highly\nemphasized in existing models. However, the intra-class compactness that\nmaintains small objective quality variance of images with the same or\nindistinguishable quality escapes the research attention, potentially leading\nto the perception-biased measures. In this paper, we reveal that such bias is\nmainly caused by the unsuitable subspace that the features are projected and\ncompared in. To account for this, we develop the Debiased Mapping based quality\nMeasure (DMM), which relies on the orthonormal bases of deep learning features\nformed by singular value decomposition (SVD). 
The SVD in deep learning feature\ndomain, which overwhelmingly separates the quality variations with singular\nvalues and projection bases, facilitates the quality inference with dedicatedly\ndesigned distance measure. Experiments on different IQA databases demonstrate\nthe mapping method is able to mitigate the perception bias efficiently, and the\nsuperior performance on quality prediction verifies the effectiveness of our\nmethod. The implementation will be publicly available.\n","authors":["Baoliang Chen","Hanwei Zhu","Lingyu Zhu","Shiqi Wang"],"pdf_url":"https://arxiv.org/pdf/2302.11464v3.pdf","comment":"Basis Angle Consistency in Sec.3.2 will be revised"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2307.07321v2","updated":"2023-10-01T13:12:29Z","published":"2023-07-13T13:17:19Z","title":"NS4AR: A new, focused on sampling areas sampling method in graphical\n recommendation Systems","summary":" The effectiveness of graphical recommender system depends on the quantity and\nquality of negative sampling. This paper selects some typical recommender\nsystem models, as well as some latest negative sampling strategies on the\nmodels as baseline. Based on typical graphical recommender model, we divide\nsample region into assigned-n areas and use AdaSim to give different weight to\nthese areas to form positive set and negative set. Because of the volume and\nsignificance of negative items, we also proposed a subset selection model to\nnarrow the core negative samples.\n","authors":["Xiangqi Wang","Dilinuer Aishan","Qi Liu"],"pdf_url":"https://arxiv.org/pdf/2307.07321v2.pdf","comment":"None"},{"id":"http://arxiv.org/abs/2309.01026v2","updated":"2023-10-01T02:57:42Z","published":"2023-09-02T21:29:53Z","title":"Zero-Shot Recommendations with Pre-Trained Large Language Models for\n Multimodal Nudging","summary":" We present a method for zero-shot recommendation of multimodal non-stationary\ncontent that leverages recent advancements in the field of generative AI. We\npropose rendering inputs of different modalities as textual descriptions and to\nutilize pre-trained LLMs to obtain their numerical representations by computing\nsemantic embeddings. Once unified representations of all content items are\nobtained, the recommendation can be performed by computing an appropriate\nsimilarity metric between them without any additional learning. We demonstrate\nour approach on a synthetic multimodal nudging environment, where the inputs\nconsist of tabular, textual, and visual data.\n","authors":["Rachel M. Harrison","Anton Dereventsov","Anton Bibin"],"pdf_url":"https://arxiv.org/pdf/2309.01026v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00703v1","updated":"2023-10-01T15:47:07Z","published":"2023-10-01T15:47:07Z","title":"A Comparative Study of Training Objectives for Clarification Facet\n Generation","summary":" Due to the ambiguity and vagueness of a user query, it is essential to\nidentify the query facets for the clarification of user intents. Existing work\non query facet generation has achieved compelling performance by sequentially\npredicting the next facet given previously generated facets based on\npre-trained language generation models such as BART. Given a query, there are\nmainly two types of training objectives to guide the facet generation models.\nOne is to generate the default sequence of ground-truth facets, and the other\nis to enumerate all the permutations of ground-truth facets and use the\nsequence that has the minimum loss for model updates. 
The second is\npermutation-invariant while the first is not. In this paper, we aim to conduct\na systematic comparative study of various types of training objectives, with\ndifferent properties of not only whether it is permutation-invariant but also\nwhether it conducts sequential prediction and whether it can control the count\nof output facets. To this end, we propose another three training objectives of\ndifferent aforementioned properties. For comprehensive comparisons, besides the\ncommonly used evaluation that measures the matching with ground-truth facets,\nwe also introduce two diversity metrics to measure the diversity of the\ngenerated facets. Based on an open-domain query facet dataset, i.e., MIMICS, we\nconduct extensive analyses and show the pros and cons of each method, which\ncould shed light on model training for clarification facet generation. The code\ncan be found at \\url{https://github.com/ShiyuNee/Facet-Generation}\n","authors":["Shiyu Ni","Keping Bi","Jiafeng Guo","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2310.00703v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00678v1","updated":"2023-10-01T14:09:21Z","published":"2023-10-01T14:09:21Z","title":"A General Offline Reinforcement Learning Framework for Interactive\n Recommendation","summary":" This paper studies the problem of learning interactive recommender systems\nfrom logged feedbacks without any exploration in online environments. We\naddress the problem by proposing a general offline reinforcement learning\nframework for recommendation, which enables maximizing cumulative user rewards\nwithout online exploration. Specifically, we first introduce a probabilistic\ngenerative model for interactive recommendation, and then propose an effective\ninference algorithm for discrete and stochastic policy learning based on logged\nfeedbacks. In order to perform offline learning more effectively, we propose\nfive approaches to minimize the distribution mismatch between the logging\npolicy and recommendation policy: support constraints, supervised\nregularization, policy constraints, dual constraints and reward extrapolation.\nWe conduct extensive experiments on two public real-world datasets,\ndemonstrating that the proposed methods can achieve superior performance over\nexisting supervised learning and reinforcement learning methods for\nrecommendation.\n","authors":["Teng Xiao","Donglin Wang"],"pdf_url":"https://arxiv.org/pdf/2310.00678v1.pdf","comment":"AAAI2021"},{"id":"http://arxiv.org/abs/2310.00654v1","updated":"2023-10-01T12:41:38Z","published":"2023-10-01T12:41:38Z","title":"Streamlining Attack Tree Generation: A Fragment-Based Approach","summary":" Attack graphs are a tool for analyzing security vulnerabilities that capture\ndifferent and prospective attacks on a system. As a threat modeling tool, it\nshows possible paths that an attacker can exploit to achieve a particular goal.\nHowever, due to the large number of vulnerabilities that are published on a\ndaily basis, they have the potential to rapidly expand in size. Consequently,\nthis necessitates a significant amount of resources to generate attack graphs.\nIn addition, generating composited attack models for complex systems such as\nself-adaptive or AI is very difficult due to their nature to continuously\nchange. In this paper, we present a novel fragment-based attack graph\ngeneration approach that utilizes information from publicly available\ninformation security databases. 
Furthermore, we also propose a domain-specific\nlanguage for attack modeling, which we employ in the proposed attack graph\ngeneration approach. Finally, we present a demonstrator example showcasing the\nattack generator's capability to replicate a verified attack chain, as\npreviously confirmed by security experts.\n","authors":["Irdin Pekaric","Markus Frick","Jubril Gbolahan Adigun","Raffaela Groner","Thomas Witte","Alexander Raschke","Michael Felderer","Matthias Tichy"],"pdf_url":"https://arxiv.org/pdf/2310.00654v1.pdf","comment":"To appear at the 57th Hawaii International Conference on Social\n Systems (HICSS-57), Honolulu, Hawaii. 2024"},{"id":"http://arxiv.org/abs/2310.00569v1","updated":"2023-10-01T03:56:38Z","published":"2023-10-01T03:56:38Z","title":"TDCGL: Two-Level Debiased Contrastive Graph Learning for Recommendation","summary":" knowledge graph-based recommendation methods have achieved great success in\nthe field of recommender systems. However, over-reliance on high-quality\nknowledge graphs is a bottleneck for such methods. Specifically, the\nlong-tailed distribution of entities of KG and noise issues in the real world\nwill make item-entity dependent relations deviate from reflecting true\ncharacteristics and significantly harm the performance of modeling user\npreference. Contrastive learning, as a novel method that is employed for data\naugmentation and denoising, provides inspiration to fill this research gap.\nHowever, the mainstream work only focuses on the long-tail properties of the\nnumber of items clicked, while ignoring that the long-tail properties of total\nnumber of clicks per user may also affect the performance of the recommendation\nmodel. Therefore, to tackle these problems, motivated by the Debiased\nContrastive Learning of Unsupervised Sentence Representations (DCLR), we\npropose Two-Level Debiased Contrastive Graph Learning (TDCGL) model.\nSpecifically, we design the Two-Level Debiased Contrastive Learning (TDCL) and\ndeploy it in the KG, which is conducted not only on User-Item pairs but also on\nUser-User pairs for modeling higher-order relations. Also, to reduce the bias\ncaused by random sampling in contrastive learning, with the exception of the\nnegative samples obtained by random sampling, we add a noise-based generation\nof negation to ensure spatial uniformity. Considerable experiments on\nopen-source datasets demonstrate that our method has excellent anti-noise\ncapability and significantly outperforms state-of-the-art baselines. In\naddition, ablation studies about the necessity for each level of TDCL are\nconducted.\n","authors":["Yubo Gao","Haotian Wu"],"pdf_url":"https://arxiv.org/pdf/2310.00569v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2305.19243v2","updated":"2023-10-01T22:36:36Z","published":"2023-05-30T17:31:25Z","title":"Unlocking Tuning-free Generalization: Minimizing the PAC-Bayes Bound\n with Trainable Priors","summary":" It is widely recognized that the generalization ability of neural networks\ncan be greatly enhanced through carefully designing the training procedure. The\ncurrent state-of-the-art training approach involves utilizing stochastic\ngradient descent (SGD) or Adam optimization algorithms along with a combination\nof additional regularization techniques such as weight decay, dropout, or noise\ninjection. Optimal generalization can only be achieved by tuning a multitude of\nhyperparameters through grid search, which can be time-consuming and\nnecessitates additional validation datasets. 
To address this issue, we\nintroduce a practical PAC-Bayes training framework that is nearly tuning-free\nand requires no additional regularization while achieving comparable testing\nperformance to that of SGD/Adam after a complete grid search and with extra\nregularizations. Our proposed algorithm demonstrates the remarkable potential\nof PAC training to achieve state-of-the-art performance on deep neural networks\nwith enhanced robustness and interpretability.\n","authors":["Xitong Zhang","Avrajit Ghosh","Guangliang Liu","Rongrong Wang"],"pdf_url":"https://arxiv.org/pdf/2305.19243v2.pdf","comment":"30 pages, 15 figures, 7 tables"},{"id":"http://arxiv.org/abs/2309.00079v2","updated":"2023-10-01T22:18:35Z","published":"2023-08-31T18:33:05Z","title":"On the Implicit Bias of Adam","summary":" In previous literature, backward error analysis was used to find ordinary\ndifferential equations (ODEs) approximating the gradient descent trajectory. It\nwas found that finite step sizes implicitly regularize solutions because terms\nappearing in the ODEs penalize the two-norm of the loss gradients. We prove\nthat the existence of similar implicit regularization in RMSProp and Adam\ndepends on their hyperparameters and the training stage, but with a different\n\"norm\" involved: the corresponding ODE terms either penalize the (perturbed)\none-norm of the loss gradients or, on the contrary, hinder its decrease (the\nlatter case being typical). We also conduct numerical experiments and discuss\nhow the proven facts can influence generalization.\n","authors":["Matias D. Cattaneo","Jason M. Klusowski","Boris Shigida"],"pdf_url":"https://arxiv.org/pdf/2309.00079v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13301v3","updated":"2023-10-01T22:07:12Z","published":"2023-05-22T17:57:41Z","title":"Training Diffusion Models with Reinforcement Learning","summary":" Diffusion models are a class of flexible generative models trained with an\napproximation to the log-likelihood objective. However, most use cases of\ndiffusion models are not concerned with likelihoods, but instead with\ndownstream objectives such as human-perceived image quality or drug\neffectiveness. In this paper, we investigate reinforcement learning methods for\ndirectly optimizing diffusion models for such objectives. We describe how\nposing denoising as a multi-step decision-making problem enables a class of\npolicy gradient algorithms, which we refer to as denoising diffusion policy\noptimization (DDPO), that are more effective than alternative reward-weighted\nlikelihood approaches. Empirically, DDPO is able to adapt text-to-image\ndiffusion models to objectives that are difficult to express via prompting,\nsuch as image compressibility, and those derived from human feedback, such as\naesthetic quality. Finally, we show that DDPO can improve prompt-image\nalignment using feedback from a vision-language model without the need for\nadditional data collection or human annotation. 
The project's website can be\nfound at http://rl-diffusion.github.io .\n","authors":["Kevin Black","Michael Janner","Yilun Du","Ilya Kostrikov","Sergey Levine"],"pdf_url":"https://arxiv.org/pdf/2305.13301v3.pdf","comment":"21 pages, 14 figures"},{"id":"http://arxiv.org/abs/2211.09110v2","updated":"2023-10-01T21:44:23Z","published":"2022-11-16T18:51:34Z","title":"Holistic Evaluation of Language Models","summary":" Language models (LMs) are becoming the foundation for almost all major\nlanguage technologies, but their capabilities, limitations, and risks are not\nwell understood. We present Holistic Evaluation of Language Models (HELM) to\nimprove the transparency of language models. First, we taxonomize the vast\nspace of potential scenarios (i.e. use cases) and metrics (i.e. desiderata)\nthat are of interest for LMs. Then we select a broad subset based on coverage\nand feasibility, noting what's missing or underrepresented (e.g. question\nanswering for neglected English dialects, metrics for trustworthiness). Second,\nwe adopt a multi-metric approach: We measure 7 metrics (accuracy, calibration,\nrobustness, fairness, bias, toxicity, and efficiency) for each of 16 core\nscenarios when possible (87.5% of the time). This ensures metrics beyond\naccuracy don't fall to the wayside, and that trade-offs are clearly exposed. We\nalso perform 7 targeted evaluations, based on 26 targeted scenarios, to analyze\nspecific aspects (e.g. reasoning, disinformation). Third, we conduct a\nlarge-scale evaluation of 30 prominent language models (spanning open,\nlimited-access, and closed models) on all 42 scenarios, 21 of which were not\npreviously used in mainstream LM evaluation. Prior to HELM, models on average\nwere evaluated on just 17.9% of the core HELM scenarios, with some prominent\nmodels not sharing a single scenario in common. We improve this to 96.0%: now\nall 30 models have been densely benchmarked on the same core scenarios and\nmetrics under standardized conditions. Our evaluation surfaces 25 top-level\nfindings. For full transparency, we release all raw model prompts and\ncompletions publicly for further analysis, as well as a general modular\ntoolkit. We intend for HELM to be a living benchmark for the community,\ncontinuously updated with new scenarios, metrics, and models.\n","authors":["Percy Liang","Rishi Bommasani","Tony Lee","Dimitris Tsipras","Dilara Soylu","Michihiro Yasunaga","Yian Zhang","Deepak Narayanan","Yuhuai Wu","Ananya Kumar","Benjamin Newman","Binhang Yuan","Bobby Yan","Ce Zhang","Christian Cosgrove","Christopher D. Manning","Christopher Ré","Diana Acosta-Navas","Drew A. 
Hudson","Eric Zelikman","Esin Durmus","Faisal Ladhak","Frieda Rong","Hongyu Ren","Huaxiu Yao","Jue Wang","Keshav Santhanam","Laurel Orr","Lucia Zheng","Mert Yuksekgonul","Mirac Suzgun","Nathan Kim","Neel Guha","Niladri Chatterji","Omar Khattab","Peter Henderson","Qian Huang","Ryan Chi","Sang Michael Xie","Shibani Santurkar","Surya Ganguli","Tatsunori Hashimoto","Thomas Icard","Tianyi Zhang","Vishrav Chaudhary","William Wang","Xuechen Li","Yifan Mai","Yuhui Zhang","Yuta Koreeda"],"pdf_url":"https://arxiv.org/pdf/2211.09110v2.pdf","comment":"Authored by the Center for Research on Foundation Models (CRFM) at\n the Stanford Institute for Human-Centered Artificial Intelligence (HAI).\n Project page: https://crfm.stanford.edu/helm/v1.0"},{"id":"http://arxiv.org/abs/2304.01230v2","updated":"2023-10-01T21:35:07Z","published":"2023-04-02T15:57:09Z","title":"SEENN: Towards Temporal Spiking Early-Exit Neural Networks","summary":" Spiking Neural Networks (SNNs) have recently become more popular as a\nbiologically plausible substitute for traditional Artificial Neural Networks\n(ANNs). SNNs are cost-efficient and deployment-friendly because they process\ninput in both spatial and temporal manner using binary spikes. However, we\nobserve that the information capacity in SNNs is affected by the number of\ntimesteps, leading to an accuracy-efficiency tradeoff. In this work, we study a\nfine-grained adjustment of the number of timesteps in SNNs. Specifically, we\ntreat the number of timesteps as a variable conditioned on different input\nsamples to reduce redundant timesteps for certain data. We call our method\nSpiking Early-Exit Neural Networks (SEENNs). To determine the appropriate\nnumber of timesteps, we propose SEENN-I which uses a confidence score\nthresholding to filter out the uncertain predictions, and SEENN-II which\ndetermines the number of timesteps by reinforcement learning. Moreover, we\ndemonstrate that SEENN is compatible with both the directly trained SNN and the\nANN-SNN conversion. By dynamically adjusting the number of timesteps, our SEENN\nachieves a remarkable reduction in the average number of timesteps during\ninference. For example, our SEENN-II ResNet-19 can achieve 96.1% accuracy with\nan average of 1.08 timesteps on the CIFAR-10 test dataset. Code is shared at\nhttps://github.com/Intelligent-Computing-Lab-Yale/SEENN.\n","authors":["Yuhang Li","Tamar Geller","Youngeun Kim","Priyadarshini Panda"],"pdf_url":"https://arxiv.org/pdf/2304.01230v2.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2212.08108v3","updated":"2023-10-01T20:48:26Z","published":"2022-12-15T19:49:27Z","title":"Dataflow Analysis-Inspired Deep Learning for Efficient Vulnerability\n Detection","summary":" Deep learning-based vulnerability detection has shown great performance and,\nin some studies, outperformed static analysis tools. However, the\nhighest-performing approaches use token-based transformer models, which are not\nthe most efficient to capture code semantics required for vulnerability\ndetection. Classical program analysis techniques such as dataflow analysis can\ndetect many types of bugs based on their root causes. In this paper, we propose\nto combine such causal-based vulnerability detection algorithms with deep\nlearning, aiming to achieve more efficient and effective vulnerability\ndetection. Specifically, we designed DeepDFA, a dataflow analysis-inspired\ngraph learning framework and an embedding technique that enables graph learning\nto simulate dataflow computation. 
We show that DeepDFA is both performant and\nefficient. DeepDFA outperformed all non-transformer baselines. It was trained\nin 9 minutes, 75x faster than the highest-performing baseline model. When using\nonly 50+ vulnerable and several hundreds of total examples as training data,\nthe model retained the same performance as 100% of the dataset. DeepDFA also\ngeneralized to real-world vulnerabilities in DbgBench; it detected 8.7 out of\n17 vulnerabilities on average across folds and was able to distinguish between\npatched and buggy versions, while the highest-performing baseline models did\nnot detect any vulnerabilities. By combining DeepDFA with a large language\nmodel, we surpassed the state-of-the-art vulnerability detection performance on\nthe Big-Vul dataset with 96.46 F1 score, 97.82 precision, and 95.14 recall. Our\nreplication package is located at https://doi.org/10.6084/m9.figshare.21225413 .\n","authors":["Benjamin Steenhoek","Hongyang Gao","Wei Le"],"pdf_url":"https://arxiv.org/pdf/2212.08108v3.pdf","comment":"Accepted at ICSE 2024 (Early Cycle). Camera-ready version"},{"id":"http://arxiv.org/abs/2308.00180v2","updated":"2023-10-01T20:40:52Z","published":"2023-07-31T22:29:16Z","title":"General Anomaly Detection of Underwater Gliders Validated by Large-scale\n Deployment Datasets","summary":" Underwater gliders have been widely used in oceanography for a range of\napplications. However, unpredictable events like shark strike or remora\nattachment can lead to abnormal glider behavior or even loss of the glider.\nThis paper employs an anomaly detection algorithm to assess operational\nconditions of underwater gliders in the ocean environment. Prompt alerts are\nprovided to glider pilots upon detecting any anomaly, so that they can take\ncontrol of the glider to prevent further harm. The detection algorithm is\napplied to abundant datasets collected in real glider deployments led by the\nSkidaway Institute of Oceanography (SkIO) in the University of Georgia and the\nUniversity of South Florida (USF). In order to demonstrate generality, the\nexperimental evaluation is applied to four glider deployment datasets.\nSpecifically, we utilize post-recovery DBD datasets carrying high-resolution\ninformation to perform detailed analysis of the anomaly and compare it with\npilot logs. Additionally, we implement the online detection based on the\nreal-time subsets of data transmitted from the glider at the surfacing events.\nWhile the real-time glider data may not contain as much rich information as the\npost-recovery one, the online detection is of great importance as it allows\nglider pilots to monitor potential abnormal conditions in real time.\n","authors":["Ruochu Yang","Chad Lembke","Fumin Zhang","Catherine Edwards"],"pdf_url":"https://arxiv.org/pdf/2308.00180v2.pdf","comment":"Accepted in IEEE/MTS OCEANS Gulf Coast 2023"},{"id":"http://arxiv.org/abs/2307.09912v2","updated":"2023-10-01T20:15:28Z","published":"2023-07-19T11:32:24Z","title":"Learning invariant representations of time-homogeneous stochastic\n dynamical systems","summary":" We consider the general class of time-homogeneous stochastic dynamical\nsystems, both discrete and continuous, and study the problem of learning a\nrepresentation of the state that faithfully captures its dynamics. This is\ninstrumental to learn the transfer operator of the system, that in turn can be\nused for numerous tasks, such as forecasting and interpreting the system\ndynamics. 
We show that the search for a good representation can be cast as an\noptimization problem over neural networks. Our approach is supported by recent\nresults in statistical learning theory, highlighting the role of approximation\nerror and metric distortion in the context of transfer operator regression. The\nobjective function we propose is associated with projection operators from the\nrepresentation space to the data space, overcomes metric distortion, and can be\nempirically estimated from data. In the discrete time setting, we further\nderive a relaxed objective function that is differentiable and numerically\nwell-conditioned. We compare our method against state-of-the-art approaches on\ndifferent datasets, showing better performance across the board.\n","authors":["Vladimir R. Kostic","Pietro Novelli","Riccardo Grazzi","Karim Lounici","Massimiliano Pontil"],"pdf_url":"https://arxiv.org/pdf/2307.09912v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08553v2","updated":"2023-10-01T20:06:26Z","published":"2023-06-14T14:58:36Z","title":"Noise Stability Optimization for Flat Minima with Tight Rates","summary":" Generalization properties are a central aspect of the design and analysis of\nlearning algorithms. One notion that has been considered in many previous works\nas leading to good generalization is flat minima, which informally describes a\nloss surface that is insensitive to noise perturbations. However, the design of\nefficient algorithms (that are easy to analyze) to find them is relatively\nunder-explored. In this paper, we propose a new algorithm to address this\nissue, which minimizes a stochastic optimization objective that averages noise\nperturbations injected into the weights of a function. This algorithm is shown\nto enjoy both theoretical and empirical advantages compared to existing\nalgorithms involving worst-case perturbations. Theoretically, we show tight\nconvergence rates of our algorithm to find first-order stationary points of the\nstochastic objective. Empirically, the algorithm induces a penalty on the trace\nof the Hessian, leading to iterates that are flatter than SGD and other\nalternatives, with tighter generalization gaps. Altogether, this work\ncontributes a provable and practical algorithm to find flat minima by\noptimizing the noise stability properties of a function.\n","authors":["Haotian Ju","Dongyue Li","Hongyang R. Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.08553v2.pdf","comment":"27 pages"},{"id":"http://arxiv.org/abs/2305.15352v2","updated":"2023-10-01T19:40:22Z","published":"2023-05-24T17:02:30Z","title":"Optimal Rates for Bandit Nonstochastic Control","summary":" Linear Quadratic Regulator (LQR) and Linear Quadratic Gaussian (LQG) control\nare foundational and extensively researched problems in optimal control. We\ninvestigate LQR and LQG problems with semi-adversarial perturbations and\ntime-varying adversarial bandit loss functions. The best-known sublinear regret\nalgorithm of~\\cite{gradu2020non} has a $T^{\\frac{3}{4}}$ time horizon\ndependence, and its authors posed an open question about whether a tight rate\nof $\\sqrt{T}$ could be achieved. We answer in the affirmative, giving an\nalgorithm for bandit LQR and LQG which attains optimal regret (up to\nlogarithmic factors) for both known and unknown systems. A central component of\nour method is a new scheme for bandit convex optimization with memory, which is\nof independent interest.\n","authors":["Y. 
Jennifer Sun","Stephen Newman","Elad Hazan"],"pdf_url":"https://arxiv.org/pdf/2305.15352v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2109.09658v4","updated":"2023-10-01T19:19:03Z","published":"2021-09-20T16:22:49Z","title":"FUTURE-AI: Guiding Principles and Consensus Recommendations for\n Trustworthy Artificial Intelligence in Medical Imaging","summary":" The recent advancements in artificial intelligence (AI) combined with the\nextensive amount of data generated by today's clinical systems, has led to the\ndevelopment of imaging AI solutions across the whole value chain of medical\nimaging, including image reconstruction, medical image segmentation,\nimage-based diagnosis and treatment planning. Notwithstanding the successes and\nfuture potential of AI in medical imaging, many stakeholders are concerned of\nthe potential risks and ethical implications of imaging AI solutions, which are\nperceived as complex, opaque, and difficult to comprehend, utilise, and trust\nin critical clinical applications. Despite these concerns and risks, there are\ncurrently no concrete guidelines and best practices for guiding future AI\ndevelopments in medical imaging towards increased trust, safety and adoption.\nTo bridge this gap, this paper introduces a careful selection of guiding\nprinciples drawn from the accumulated experiences, consensus, and best\npractices from five large European projects on AI in Health Imaging. These\nguiding principles are named FUTURE-AI and its building blocks consist of (i)\nFairness, (ii) Universality, (iii) Traceability, (iv) Usability, (v) Robustness\nand (vi) Explainability. In a step-by-step approach, these guidelines are\nfurther translated into a framework of concrete recommendations for specifying,\ndeveloping, evaluating, and deploying technically, clinically and ethically\ntrustworthy AI solutions into clinical practice.\n","authors":["Karim Lekadir","Richard Osuala","Catherine Gallin","Noussair Lazrak","Kaisar Kushibar","Gianna Tsakou","Susanna Aussó","Leonor Cerdá Alberich","Kostas Marias","Manolis Tsiknakis","Sara Colantonio","Nickolas Papanikolaou","Zohaib Salahuddin","Henry C Woodruff","Philippe Lambin","Luis Martí-Bonmatí"],"pdf_url":"https://arxiv.org/pdf/2109.09658v4.pdf","comment":"Please refer to arXiv:2309.12325 for the latest FUTURE-AI framework\n for healthcare"},{"id":"http://arxiv.org/abs/2306.11886v2","updated":"2023-10-01T18:56:37Z","published":"2023-06-20T20:59:10Z","title":"SPRINT: Scalable Policy Pre-Training via Language Instruction Relabeling","summary":" Pre-training robot policies with a rich set of skills can substantially\naccelerate the learning of downstream tasks. Prior works have defined\npre-training tasks via natural language instructions, but doing so requires\ntedious human annotation of hundreds of thousands of instructions. Thus, we\npropose SPRINT, a scalable offline policy pre-training approach which\nsubstantially reduces the human effort needed for pre-training a diverse set of\nskills. Our method uses two core ideas to automatically expand a base set of\npre-training tasks: instruction relabeling via large language models and\ncross-trajectory skill chaining through offline reinforcement learning. As a\nresult, SPRINT pre-training equips robots with a much richer repertoire of\nskills. 
Experimental results in a household simulator and on a real robot\nkitchen manipulation task show that SPRINT leads to substantially faster\nlearning of new long-horizon tasks than previous pre-training approaches.\nWebsite at https://clvrai.com/sprint.\n","authors":["Jesse Zhang","Karl Pertsch","Jiahui Zhang","Joseph J. Lim"],"pdf_url":"https://arxiv.org/pdf/2306.11886v2.pdf","comment":"29 pages, 18 figures"},{"id":"http://arxiv.org/abs/2004.08249v3","updated":"2023-10-01T18:34:20Z","published":"2020-04-17T13:59:07Z","title":"Understanding the Difficulty of Training Transformers","summary":" Transformers have proved effective in many NLP tasks. However, their training\nrequires non-trivial efforts regarding designing cutting-edge optimizers and\nlearning rate schedulers carefully (e.g., conventional SGD fails to train\nTransformers effectively). Our objective here is to understand $\\textit{what\ncomplicates Transformer training}$ from both empirical and theoretical\nperspectives. Our analysis reveals that unbalanced gradients are not the root\ncause of the instability of training. Instead, we identify an amplification\neffect that influences training substantially -- for each layer in a\nmulti-layer Transformer model, heavy dependency on its residual branch makes\ntraining unstable, since it amplifies small parameter perturbations (e.g.,\nparameter updates) and results in significant disturbances in the model output.\nYet we observe that a light dependency limits the model potential and leads to\ninferior trained models. Inspired by our analysis, we propose Admin\n($\\textbf{Ad}$aptive $\\textbf{m}$odel $\\textbf{in}$itialization) to stabilize\nthe early stage's training and unleash its full potential in the late\nstage. Extensive experiments show that Admin is more stable, converges faster,\nand leads to better performance. Implementations are released at:\nhttps://github.com/LiyuanLucasLiu/Transforemr-Clinic.\n","authors":["Liyuan Liu","Xiaodong Liu","Jianfeng Gao","Weizhu Chen","Jiawei Han"],"pdf_url":"https://arxiv.org/pdf/2004.08249v3.pdf","comment":"EMNLP 2020"},{"id":"http://arxiv.org/abs/2306.09251v2","updated":"2023-10-01T17:51:23Z","published":"2023-06-15T16:30:08Z","title":"Towards Faster Non-Asymptotic Convergence for Diffusion-Based Generative\n Models","summary":" Diffusion models, which convert noise into new data instances by learning to\nreverse a Markov diffusion process, have become a cornerstone in contemporary\ngenerative modeling. While their practical power has now been widely\nrecognized, the theoretical underpinnings remain far from mature. In this work,\nwe develop a suite of non-asymptotic theory towards understanding the data\ngeneration process of diffusion models in discrete time, assuming access to\n$\\ell_2$-accurate estimates of the (Stein) score functions. For a popular\ndeterministic sampler (based on the probability flow ODE), we establish a\nconvergence rate proportional to $1/T$ (with $T$ the total number of steps),\nimproving upon past results; for another mainstream stochastic sampler (i.e., a\ntype of the denoising diffusion probabilistic model), we derive a convergence\nrate proportional to $1/\\sqrt{T}$, matching the state-of-the-art theory.\nImposing only minimal assumptions on the target data distribution (e.g., no\nsmoothness assumption is imposed), our results characterize how $\\ell_2$ score\nestimation errors affect the quality of the data generation processes. 
In\ncontrast to prior works, our theory is developed based on an elementary yet\nversatile non-asymptotic approach without resorting to toolboxes for SDEs and\nODEs. Further, we design two accelerated variants, improving the convergence to\n$1/T^2$ for the ODE-based sampler and $1/T$ for the DDPM-type sampler, which\nmight be of independent theoretical and empirical interest.\n","authors":["Gen Li","Yuting Wei","Yuxin Chen","Yuejie Chi"],"pdf_url":"https://arxiv.org/pdf/2306.09251v2.pdf","comment":"Score estimation errors are included in the convergence theory in the\n new version"},{"id":"http://arxiv.org/abs/2301.11147v2","updated":"2023-10-01T17:13:42Z","published":"2023-01-26T14:54:39Z","title":"Train Hard, Fight Easy: Robust Meta Reinforcement Learning","summary":" A major challenge of reinforcement learning (RL) in real-world applications\nis the variation between environments, tasks or clients. Meta-RL (MRL)\naddresses this issue by learning a meta-policy that adapts to new tasks.\nStandard MRL methods optimize the average return over tasks, but often suffer\nfrom poor results in tasks of high risk or difficulty. This limits system\nreliability since test tasks are not known in advance. In this work, we define\na robust MRL objective with a controlled robustness level. Optimization of\nanalogous robust objectives in RL is known to lead to both *biased gradients*\nand *data inefficiency*. We prove that the gradient bias disappears in our\nproposed MRL framework. The data inefficiency is addressed via the novel Robust\nMeta RL algorithm (RoML). RoML is a meta-algorithm that generates a robust\nversion of any given MRL algorithm, by identifying and over-sampling harder\ntasks throughout training. We demonstrate that RoML achieves robust returns on\nmultiple navigation and continuous control benchmarks.\n","authors":["Ido Greenberg","Shie Mannor","Gal Chechik","Eli Meirom"],"pdf_url":"https://arxiv.org/pdf/2301.11147v2.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2306.06599v2","updated":"2023-10-01T16:47:45Z","published":"2023-06-11T06:27:06Z","title":"Variational Imbalanced Regression: Fair Uncertainty Quantification via\n Probabilistic Smoothing","summary":" Existing regression models tend to fall short in both accuracy and\nuncertainty estimation when the label distribution is imbalanced. In this\npaper, we propose a probabilistic deep learning model, dubbed variational\nimbalanced regression (VIR), which not only performs well in imbalanced\nregression but naturally produces reasonable uncertainty estimation as a\nbyproduct. Different from typical variational autoencoders assuming I.I.D.\nrepresentations (a data point's representation is not directly affected by\nother data points), our VIR borrows data with similar regression labels to\ncompute the latent representation's variational distribution; furthermore,\ndifferent from deterministic regression models producing point estimates, VIR\npredicts the entire normal-inverse-gamma distributions and modulates the\nassociated conjugate distributions to impose probabilistic reweighting on the\nimbalanced data, thereby providing better uncertainty estimation. Experiments\nin several real-world datasets show that our VIR can outperform\nstate-of-the-art imbalanced regression models in terms of both accuracy and\nuncertainty estimation. 
Code will soon be available at\n\\url{https://github.com/Wang-ML-Lab/variational-imbalanced-regression}.\n","authors":["Ziyan Wang","Hao Wang"],"pdf_url":"https://arxiv.org/pdf/2306.06599v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.06686v2","updated":"2023-10-01T16:41:51Z","published":"2023-04-13T17:34:44Z","title":"OKRidge: Scalable Optimal k-Sparse Ridge Regression","summary":" We consider an important problem in scientific discovery, namely identifying\nsparse governing equations for nonlinear dynamical systems. This involves\nsolving sparse ridge regression problems to provable optimality in order to\ndetermine which terms drive the underlying dynamics. We propose a fast\nalgorithm, OKRidge, for sparse ridge regression, using a novel lower bound\ncalculation involving, first, a saddle point formulation, and from there,\neither solving (i) a linear system or (ii) using an ADMM-based approach, where\nthe proximal operators can be efficiently evaluated by solving another linear\nsystem and an isotonic regression problem. We also propose a method to\nwarm-start our solver, which leverages a beam search. Experimentally, our\nmethods attain provable optimality with run times that are orders of magnitude\nfaster than those of the existing MIP formulations solved by the commercial\nsolver Gurobi.\n","authors":["Jiachang Liu","Sam Rosen","Chudi Zhong","Cynthia Rudin"],"pdf_url":"https://arxiv.org/pdf/2304.06686v2.pdf","comment":"NeurIPS 2023, pre camera ready"},{"id":"http://arxiv.org/abs/2301.02615v2","updated":"2023-10-01T16:32:23Z","published":"2023-01-05T15:11:05Z","title":"Silent Killer: A Stealthy, Clean-Label, Black-Box Backdoor Attack","summary":" Backdoor poisoning attacks pose a well-known risk to neural networks.\nHowever, most studies have focused on lenient threat models. We introduce\nSilent Killer, a novel attack that operates in clean-label, black-box settings,\nuses a stealthy poison and trigger and outperforms existing methods. We\ninvestigate the use of universal adversarial perturbations as triggers in\nclean-label attacks, following the success of such approaches under\npoison-label settings. We analyze the success of a naive adaptation and find\nthat gradient alignment for crafting the poison is required to ensure high\nsuccess rates. 
We conduct thorough experiments on MNIST, CIFAR10, and a reduced\nversion of ImageNet and achieve state-of-the-art results.\n","authors":["Tzvi Lederer","Gallil Maimon","Lior Rokach"],"pdf_url":"https://arxiv.org/pdf/2301.02615v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.07562v2","updated":"2023-10-01T16:18:46Z","published":"2022-06-15T14:24:59Z","title":"Federated Learning with Uncertainty via Distilled Predictive\n Distributions","summary":" Most existing federated learning methods are unable to estimate\nmodel/predictive uncertainty since the client models are trained using the\nstandard loss function minimization approach which ignores such uncertainties.\nIn many situations, however, especially in limited data settings, it is\nbeneficial to take into account the uncertainty in the model parameters at each\nclient as it leads to more accurate predictions and also because reliable\nestimates of uncertainty can be used for tasks, such as out-of-distribution\n(OOD) detection, and sequential decision-making tasks, such as active learning.\nWe present a framework for federated learning with uncertainty where, in each\nround, each client infers the posterior distribution over its parameters as\nwell as the posterior predictive distribution (PPD), distills the PPD into a\nsingle deep neural network, and sends this network to the server. Unlike some\nof the recent Bayesian approaches to federated learning, our approach does not\nrequire sending the whole posterior distribution of the parameters from each\nclient to the server but only the PPD in the distilled form as a deep neural\nnetwork. In addition, when making predictions at test time, it does not require\ncomputationally expensive Monte-Carlo averaging over the posterior distribution\nbecause our approach always maintains the PPD in the form of a single deep\nneural network. Moreover, our approach does not make any restrictive\nassumptions, such as the form of the clients' posterior distributions, or of\ntheir PPDs. We evaluate our approach on classification in federated setting, as\nwell as active learning and OOD detection in federated settings, on which our\napproach outperforms various existing federated learning baselines.\n","authors":["Shrey Bhatt","Aishwarya Gupta","Piyush Rai"],"pdf_url":"https://arxiv.org/pdf/2206.07562v2.pdf","comment":"Accepted at ACML 2023; 21 pages(14 pages of main content, 2 pages of\n references, and 5 pages of supplementary content)"},{"id":"http://arxiv.org/abs/2203.12114v2","updated":"2023-10-01T15:54:32Z","published":"2022-03-23T00:59:35Z","title":"An Optical Control Environment for Benchmarking Reinforcement Learning\n Algorithms","summary":" Deep reinforcement learning has the potential to address various scientific\nproblems. In this paper, we implement an optics simulation environment for\nreinforcement learning based controllers. The environment captures the essence\nof nonconvexity, nonlinearity, and time-dependent noise inherent in optical\nsystems, offering a more realistic setting. Subsequently, we provide the\nbenchmark results of several reinforcement learning algorithms on the proposed\nsimulation environment. The experimental findings demonstrate the superiority\nof off-policy reinforcement learning approaches over traditional control\nalgorithms in navigating the intricacies of complex optical control\nenvironments. 
The code of the paper is available at\nhttps://github.com/Walleclipse/Reinforcement-Learning-Pulse-Stacking.\n","authors":["Abulikemu Abuduweili","Changliu Liu"],"pdf_url":"https://arxiv.org/pdf/2203.12114v2.pdf","comment":"Transactions on Machine Learning Research (2023)"},{"id":"http://arxiv.org/abs/2308.07939v2","updated":"2023-10-01T15:43:45Z","published":"2023-08-14T12:17:11Z","title":"Ada-QPacknet -- adaptive pruning with bit width reduction as an\n efficient continual learning method without forgetting","summary":" Continual Learning (CL) is a process in which there is still huge gap between\nhuman and deep learning model efficiency. Recently, many CL algorithms were\ndesigned. Most of them have many problems with learning in dynamic and complex\nenvironments. In this work new architecture based approach Ada-QPacknet is\ndescribed. It incorporates the pruning for extracting the sub-network for each\ntask. The crucial aspect in architecture based CL methods is theirs capacity.\nIn presented method the size of the model is reduced by efficient linear and\nnonlinear quantisation approach. The method reduces the bit-width of the\nweights format. The presented results shows that low bit quantisation achieves\nsimilar accuracy as floating-point sub-network on a well-know CL scenarios. To\nour knowledge it is the first CL strategy which incorporates both compression\ntechniques pruning and quantisation for generating task sub-networks. The\npresented algorithm was tested on well-known episode combinations and compared\nwith most popular algorithms. Results show that proposed approach outperforms\nmost of the CL strategies in task and class incremental scenarios.\n","authors":["Marcin Pietroń","Dominik Żurek","Kamil Faber","Roberto Corizzo"],"pdf_url":"https://arxiv.org/pdf/2308.07939v2.pdf","comment":"Paper accepted at ECAI 2023"},{"id":"http://arxiv.org/abs/2306.08243v3","updated":"2023-10-01T15:20:24Z","published":"2023-06-14T05:04:11Z","title":"MMASD: A Multimodal Dataset for Autism Intervention Analysis","summary":" Autism spectrum disorder (ASD) is a developmental disorder characterized by\nsignificant social communication impairments and difficulties perceiving and\npresenting communication cues. Machine learning techniques have been broadly\nadopted to facilitate autism studies and assessments. However, computational\nmodels are primarily concentrated on specific analysis and validated on private\ndatasets in the autism community, which limits comparisons across models due to\nprivacy-preserving data sharing complications. This work presents a novel\nprivacy-preserving open-source dataset, MMASD as a MultiModal ASD benchmark\ndataset, collected from play therapy interventions of children with Autism.\nMMASD includes data from 32 children with ASD, and 1,315 data samples segmented\nfrom over 100 hours of intervention recordings. To promote public access, each\ndata sample consists of four privacy-preserving modalities of data; some of\nwhich are derived from original videos: (1) optical flow, (2) 2D skeleton, (3)\n3D skeleton, and (4) clinician ASD evaluation scores of children, e.g., ADOS\nscores. MMASD aims to assist researchers and therapists in understanding\nchildren's cognitive status, monitoring their progress during therapy, and\ncustomizing the treatment plan accordingly. It also has inspiration for\ndownstream tasks such as action quality assessment and interpersonal synchrony\nestimation. 
MMASD dataset can be easily accessed at\nhttps://github.com/Li-Jicheng/MMASD-A-Multimodal-Dataset-for-Autism-Intervention-Analysis.\n","authors":["Jicheng Li","Vuthea Chheang","Pinar Kullu","Eli Brignac","Zhang Guo","Kenneth E. Barner","Anjana Bhat","Roghayeh Leila Barmaki"],"pdf_url":"https://arxiv.org/pdf/2306.08243v3.pdf","comment":"8 pages, 2 figures"},{"id":"http://arxiv.org/abs/2306.06040v2","updated":"2023-10-01T15:14:35Z","published":"2023-06-09T17:05:53Z","title":"Reconstructing Human Expressiveness in Piano Performances with a\n Transformer Network","summary":" Capturing intricate and subtle variations in human expressiveness in music\nperformance using computational approaches is challenging. In this paper, we\npropose a novel approach for reconstructing human expressiveness in piano\nperformance with a multi-layer bi-directional Transformer encoder. To address\nthe needs for large amounts of accurately captured and score-aligned\nperformance data in training neural networks, we use transcribed scores\nobtained from an existing transcription model to train our model. We integrate\npianist identities to control the sampling process and explore the ability of\nour system to model variations in expressiveness for different pianists. The\nsystem is evaluated through statistical analysis of generated expressive\nperformances and a listening test. Overall, the results suggest that our method\nachieves state-of-the-art in generating human-like piano performances from\ntranscribed scores, while fully and consistently reconstructing human\nexpressiveness poses further challenges.\n","authors":["Jingjing Tang","Geraint Wiggins","Gyorgy Fazekas"],"pdf_url":"https://arxiv.org/pdf/2306.06040v2.pdf","comment":"12 pages, 5 figures, accepted by CMMR2023, the 16th International\n Symposium on Computer Music Multidisciplinary Research"},{"id":"http://arxiv.org/abs/2306.07856v2","updated":"2023-10-01T15:01:45Z","published":"2023-06-13T15:35:01Z","title":"DreamDecompiler: Bayesian Program Learning by Decompiling Amortised\n Knowledge","summary":" Solving program induction problems requires searching through an enormous\nspace of possibilities. DreamCoder is an inductive program synthesis system\nthat, whilst solving problems, learns to simplify search in an iterative\nwake-sleep procedure. The cost of search is amortised by training a neural\nsearch policy, reducing search breadth and effectively \"compiling\" useful\ninformation to compose program solutions across tasks. Additionally, a library\nof program components is learnt to express discovered solutions in fewer\ncomponents, reducing search depth. In DreamCoder, the neural search policy has\nonly an indirect effect on the library learnt through the program solutions it\nhelps discover. We present an approach for library learning that directly\nleverages the neural search policy, effectively \"decompiling\" its amortised\nknowledge to extract relevant program components. This provides stronger\namortised inference: the amortised knowledge learnt to reduce search breadth is\nnow also used to reduce search depth. We integrate our approach with DreamCoder\nand demonstrate faster domain proficiency with improved generalisation on a\nrange of domains, particularly when fewer example solutions are available.\n","authors":["Alessandro B. Palmarini","Christopher G. Lucas","N. 
Siddharth"],"pdf_url":"https://arxiv.org/pdf/2306.07856v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.15048v2","updated":"2023-10-01T14:50:31Z","published":"2023-09-26T16:25:57Z","title":"Class Incremental Learning via Likelihood Ratio Based Task Prediction","summary":" Class incremental learning (CIL) is a challenging setting of continual\nlearning, which learns a series of tasks sequentially. Each task consists of a\nset of unique classes. The key feature of CIL is that no task identifier (or\ntask-id) is provided at test time for each test sample. Predicting the task-id\nfor each test sample is a challenging problem. An emerging theoretically\njustified and effective approach is to train a task-specific model for each\ntask in a shared network for all tasks based on a task-incremental learning\n(TIL) method to deal with forgetting. The model for each task in this approach\nis an out-of-distribution (OOD) detector rather than a conventional classifier.\nThe OOD detector can perform both within-task (in-distribution (IND)) class\nprediction and OOD detection. The OOD detection capability is the key for\ntask-id prediction during inference for each test sample. However, this paper\nargues that using a traditional OOD detector for task-id prediction is\nsub-optimal because additional information (e.g., the replay data and the\nlearned tasks) available in CIL can be exploited to design a better and\nprincipled method for task-id prediction. We call the new method TPLR (Task-id\nPrediction based on Likelihood Ratio}). TPLR markedly outperforms strong CIL\nbaselines.\n","authors":["Haowei Lin","Yijia Shao","Weinan Qian","Ningxin Pan","Yiduo Guo","Bing Liu"],"pdf_url":"https://arxiv.org/pdf/2309.15048v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.09454v2","updated":"2023-10-01T13:45:23Z","published":"2023-09-18T03:28:48Z","title":"Asymptotically Efficient Online Learning for Censored Regression Models\n Under Non-I.I.D Data","summary":" The asymptotically efficient online learning problem is investigated for\nstochastic censored regression models, which arise from various fields of\nlearning and statistics but up to now still lacks comprehensive theoretical\nstudies on the efficiency of the learning algorithms. For this, we propose a\ntwo-step online algorithm, where the first step focuses on achieving algorithm\nconvergence, and the second step is dedicated to improving the estimation\nperformance. Under a general excitation condition on the data, we show that our\nalgorithm is strongly consistent and asymptotically normal by employing the\nstochastic Lyapunov function method and limit theories for martingales.\nMoreover, we show that the covariances of the estimates can achieve the\nCramer-Rao (C-R) bound asymptotically, indicating that the performance of the\nproposed algorithm is the best possible that one can expect in general. Unlike\nmost of the existing works, our results are obtained without resorting to the\ntraditionally used but stringent conditions such as independent and identically\ndistributed (i.i.d) assumption on the data, and thus our results do not exclude\napplications to stochastic dynamical systems with feedback. 
A numerical example\nis also provided to illustrate the superiority of the proposed online algorithm\nover the existing related ones in the literature.\n","authors":["Lantian Zhang","Lei Guo"],"pdf_url":"https://arxiv.org/pdf/2309.09454v2.pdf","comment":"35 pages"},{"id":"http://arxiv.org/abs/2306.05079v2","updated":"2023-10-01T13:01:07Z","published":"2023-06-08T10:02:04Z","title":"Enhancing Robustness of AI Offensive Code Generators via Data\n Augmentation","summary":" In this work, we present a method to add perturbations to the code\ndescriptions to create new inputs in natural language (NL) from\nwell-intentioned developers that diverge from the original ones due to the use\nof new words or because they miss part of them. The goal is to analyze how and\nto what extent perturbations affect the performance of AI code generators in\nthe context of security-oriented code. First, we show that perturbed\ndescriptions preserve the semantics of the original, non-perturbed ones. Then,\nwe use the method to assess the robustness of three state-of-the-art code\ngenerators against the newly perturbed inputs, showing that the performance of\nthese AI-based solutions is highly affected by perturbations in the NL\ndescriptions. To enhance their robustness, we use the method to perform data\naugmentation, i.e., to increase the variability and diversity of the NL\ndescriptions in the training data, proving its effectiveness against both\nperturbed and non-perturbed code descriptions.\n","authors":["Cristina Improta","Pietro Liguori","Roberto Natella","Bojan Cukic","Domenico Cotroneo"],"pdf_url":"https://arxiv.org/pdf/2306.05079v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03980v2","updated":"2023-10-01T12:57:01Z","published":"2023-07-08T14:08:37Z","title":"Building and Road Segmentation Using EffUNet and Transfer Learning\n Approach","summary":" In city, information about urban objects such as water supply, railway lines,\npower lines, buildings, roads, etc., is necessary for city planning. In\nparticular, information about the spread of these objects, locations and\ncapacity is needed for the policymakers to make impactful decisions. This\nthesis aims to segment the building and roads from the aerial image captured by\nthe satellites and UAVs. Many different architectures have been proposed for\nthe semantic segmentation task and UNet being one of them. In this thesis, we\npropose a novel architecture based on Google's newly proposed EfficientNetV2 as\nan encoder for feature extraction with UNet decoder for constructing the\nsegmentation map. Using this approach we achieved a benchmark score for the\nMassachusetts Building and Road dataset with an mIOU of 0.8365 and 0.9153\nrespectively.\n","authors":["Sahil Gangurde"],"pdf_url":"https://arxiv.org/pdf/2307.03980v2.pdf","comment":"The transformer network analysis was not included in the current\n paper"},{"id":"http://arxiv.org/abs/2307.03723v2","updated":"2023-10-01T11:21:37Z","published":"2023-07-06T16:44:03Z","title":"Steel Surface Roughness Parameter Calculations Using Lasers and Machine\n Learning Models","summary":" Control of surface texture in strip steel is essential to meet customer\nrequirements during galvanizing and temper rolling processes. Traditional\nmethods rely on post-production stylus measurements, while on-line techniques\noffer non-contact and real-time measurements of the entire strip. However,\nensuring accurate measurement is imperative for their effective utilization in\nthe manufacturing pipeline. 
Moreover, accurate on-line measurements enable\nreal-time adjustments of manufacturing processing parameters during production,\nensuring consistent quality and the possibility of closed-loop control of the\ntemper mill. In this study, we leverage state-of-the-art machine learning\nmodels to enhance the transformation of on-line measurements into significantly\na more accurate Ra surface roughness metric. By comparing a selection of\ndata-driven approaches, including both deep learning and non-deep learning\nmethods, to the close-form transformation, we evaluate their potential for\nimproving surface texture control in temper strip steel manufacturing.\n","authors":["Alex Milne","Xianghua Xie"],"pdf_url":"https://arxiv.org/pdf/2307.03723v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12856v2","updated":"2023-10-01T10:30:27Z","published":"2023-07-24T14:56:30Z","title":"A Real-World WebAgent with Planning, Long Context Understanding, and\n Program Synthesis","summary":" Pre-trained large language models (LLMs) have recently achieved better\ngeneralization and sample efficiency in autonomous web automation. However, the\nperformance on real-world websites has still suffered from (1) open domainness,\n(2) limited context length, and (3) lack of inductive bias on HTML. We\nintroduce WebAgent, an LLM-driven agent that learns from self-experience to\ncomplete tasks on real websites following natural language instructions.\nWebAgent plans ahead by decomposing instructions into canonical\nsub-instructions, summarizes long HTML documents into task-relevant snippets,\nand acts on websites via Python programs generated from those. We design\nWebAgent with Flan-U-PaLM, for grounded code generation, and HTML-T5, new\npre-trained LLMs for long HTML documents using local and global attention\nmechanisms and a mixture of long-span denoising objectives, for planning and\nsummarization. We empirically demonstrate that our modular recipe improves the\nsuccess on real websites by over 50%, and that HTML-T5 is the best model to\nsolve various HTML understanding tasks; achieving 18.7% higher success rate\nthan the prior method on MiniWoB web automation benchmark, and SoTA performance\non Mind2Web, an offline task planning evaluation.\n","authors":["Izzeddin Gur","Hiroki Furuta","Austin Huang","Mustafa Safdari","Yutaka Matsuo","Douglas Eck","Aleksandra Faust"],"pdf_url":"https://arxiv.org/pdf/2307.12856v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.11854v2","updated":"2023-10-01T10:15:01Z","published":"2023-05-19T17:44:34Z","title":"Multimodal Web Navigation with Instruction-Finetuned Foundation Models","summary":" The progress of autonomous web navigation has been hindered by the dependence\non billions of exploratory interactions via online reinforcement learning, and\ndomain-specific model designs that make it difficult to leverage generalization\nfrom rich out-of-domain data. In this work, we study data-driven offline\ntraining for web agents with vision-language foundation models. We propose an\ninstruction-following multimodal agent, WebGUM, that observes both webpage\nscreenshots and HTML pages and outputs web navigation actions, such as click\nand type. WebGUM is trained by jointly finetuning an instruction-finetuned\nlanguage model and a vision encoder with temporal and local perception on a\nlarge corpus of demonstrations. 
We empirically demonstrate this recipe improves\nthe agent's ability of grounded multimodal perception, HTML comprehension, and\nmulti-step reasoning, outperforming prior works by a significant margin. On the\nMiniWoB, we improve over the previous best offline methods by more than 45.8%,\neven outperforming online-finetuned SoTA, humans, and GPT-4-based agent. On the\nWebShop benchmark, our 3-billion-parameter model achieves superior performance\nto the existing SoTA, PaLM-540B. Furthermore, WebGUM exhibits strong positive\ntransfer to the real-world planning tasks on the Mind2Web. We also collect 347K\nhigh-quality demonstrations using our trained models, 38 times larger than\nprior work, and make them available to promote future research in this\ndirection.\n","authors":["Hiroki Furuta","Kuang-Huei Lee","Ofir Nachum","Yutaka Matsuo","Aleksandra Faust","Shixiang Shane Gu","Izzeddin Gur"],"pdf_url":"https://arxiv.org/pdf/2305.11854v2.pdf","comment":"Website: https://sites.google.com/view/mm-webnav/"},{"id":"http://arxiv.org/abs/2308.08493v2","updated":"2023-10-01T09:11:03Z","published":"2023-08-16T16:48:57Z","title":"Time Travel in LLMs: Tracing Data Contamination in Large Language Models","summary":" Data contamination, i.e., the presence of test data from downstream tasks in\nthe training data of large language models (LLMs), is a potential major issue\nin measuring LLMs' real effectiveness on other tasks. We propose a\nstraightforward yet effective method for identifying data contamination within\nLLMs. At its core, our approach starts by identifying potential contamination\nat the instance level; using this information, our approach then assesses wider\ncontamination at the partition level. To estimate contamination of individual\ninstances, we employ \"guided instruction:\" a prompt consisting of the dataset\nname, partition type, and the random-length initial segment of a reference\ninstance, asking the LLM to complete it. An instance is flagged as contaminated\nif the LLM's output either exactly or nearly matches the latter segment of the\nreference. To understand if an entire partition is contaminated, we propose two\nideas. The first idea marks a dataset partition as contaminated if the average\noverlap score with the reference instances (as measured by ROUGE-L or BLEURT)\nis statistically significantly better with the completions from guided\ninstruction compared to a \"general instruction\" that does not include the\ndataset and partition name. The second idea marks a dataset partition as\ncontaminated if a classifier based on GPT-4 with few-shot in-context learning\nprompt marks multiple generated completions as exact/near-exact matches of the\ncorresponding reference instances. Our best method achieves an accuracy between\n92% and 100% in detecting if an LLM is contaminated with seven datasets,\ncontaining train and test/validation partitions, when contrasted with manual\nevaluation by human experts. 
Further, our findings indicate that GPT-4 is\ncontaminated with AG News, WNLI, and XSum datasets.\n","authors":["Shahriar Golchin","Mihai Surdeanu"],"pdf_url":"https://arxiv.org/pdf/2308.08493v2.pdf","comment":"v2 preprint"},{"id":"http://arxiv.org/abs/2303.00315v2","updated":"2023-10-01T08:13:39Z","published":"2023-03-01T08:24:54Z","title":"Efficient Explorative Key-term Selection Strategies for Conversational\n Contextual Bandits","summary":" Conversational contextual bandits elicit user preferences by occasionally\nquerying for explicit feedback on key-terms to accelerate learning. However,\nthere are aspects of existing approaches which limit their performance. First,\ninformation gained from key-term-level conversations and arm-level\nrecommendations is not appropriately incorporated to speed up learning. Second,\nit is important to ask explorative key-terms to quickly elicit the user's\npotential interests in various domains to accelerate the convergence of user\npreference estimation, which has never been considered in existing works. To\ntackle these issues, we first propose ``ConLinUCB\", a general framework for\nconversational bandits with better information incorporation, combining\narm-level and key-term-level feedback to estimate user preference in one step\nat each time. Based on this framework, we further design two bandit algorithms\nwith explorative key-term selection strategies, ConLinUCB-BS and ConLinUCB-MCR.\nWe prove tighter regret upper bounds of our proposed algorithms. Particularly,\nConLinUCB-BS achieves a regret bound of $O(d\\sqrt{T\\log T})$, better than the\nprevious result $O(d\\sqrt{T}\\log T)$. Extensive experiments on synthetic and\nreal-world data show significant advantages of our algorithms in learning\naccuracy (up to 54\\% improvement) and computational efficiency (up to 72\\%\nimprovement), compared to the classic ConUCB algorithm, showing the potential\nbenefit to recommender systems.\n","authors":["Zhiyong Wang","Xutong Liu","Shuai Li","John C. S. Lui"],"pdf_url":"https://arxiv.org/pdf/2303.00315v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15852v2","updated":"2023-10-01T07:22:39Z","published":"2023-05-25T08:43:46Z","title":"Self-contradictory Hallucinations of Large Language Models: Evaluation,\n Detection and Mitigation","summary":" Large language models (large LMs) are susceptible to producing text that\ncontains hallucinated content. An important instance of this problem is\nself-contradiction, where the LM generates two contradictory sentences within\nthe same context. In this work, we present a comprehensive investigation into\nself-contradiction for various instruction-tuned LMs, covering evaluation,\ndetection, and mitigation. Our analysis reveals the prevalence of\nself-contradictions when LMs generate text for open-domain topics, e.g., in\n17.7% of all sentences produced by ChatGPT. Self-contradiction also complements\nretrieval-based methods, as a large portion of them (e.g., 35.8% for ChatGPT)\ncannot be verified using Wikipedia. We then propose a novel prompting-based\nframework designed to effectively detect and mitigate self-contradictions. Our\ndetector achieves high accuracy, e.g., around 80% F1 score when prompting\nChatGPT. The mitigation algorithm iteratively refines the generated text to\nremove contradictory information while preserving text fluency and\ninformativeness. Importantly, our entire framework is applicable to black-box\nLMs and does not require external grounded knowledge. 
Our approach is\npractically effective and has been released as a push-button tool to benefit\nthe public, available at https://chatprotect.ai/.\n","authors":["Niels Mündler","Jingxuan He","Slobodan Jenko","Martin Vechev"],"pdf_url":"https://arxiv.org/pdf/2305.15852v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.12718v2","updated":"2023-10-01T06:33:18Z","published":"2023-05-22T04:54:10Z","title":"HighLight: Efficient and Flexible DNN Acceleration with Hierarchical\n Structured Sparsity","summary":" Due to complex interactions among various deep neural network (DNN)\noptimization techniques, modern DNNs can have weights and activations that are\ndense or sparse with diverse sparsity degrees. To offer a good trade-off\nbetween accuracy and hardware performance, an ideal DNN accelerator should have\nhigh flexibility to efficiently translate DNN sparsity into reductions in\nenergy and/or latency without incurring significant complexity overhead.\n This paper introduces hierarchical structured sparsity (HSS), with the key\ninsight that we can systematically represent diverse sparsity degrees by having\nthem hierarchically composed from multiple simple sparsity patterns. As a\nresult, HSS simplifies the underlying hardware since it only needs to support\nsimple sparsity patterns; this significantly reduces the sparsity acceleration\noverhead, which improves efficiency. Motivated by such opportunities, we\npropose a simultaneously efficient and flexible accelerator, named HighLight,\nto accelerate DNNs that have diverse sparsity degrees (including dense). Due to\nthe flexibility of HSS, different HSS patterns can be introduced to DNNs to\nmeet different applications' accuracy requirements. Compared to existing works,\nHighLight achieves a geomean of up to 6.4x better energy-delay product (EDP)\nacross workloads with diverse sparsity degrees, and always sits on the\nEDP-accuracy Pareto frontier for representative DNNs\n","authors":["Yannan Nellie Wu","Po-An Tsai","Saurav Muralidharan","Angshuman Parashar","Vivienne Sze","Joel S. Emer"],"pdf_url":"https://arxiv.org/pdf/2305.12718v2.pdf","comment":"Accepted to MICRO23"},{"id":"http://arxiv.org/abs/2210.17237v3","updated":"2023-10-01T04:48:00Z","published":"2022-10-31T11:43:05Z","title":"Latent Multimodal Functional Graphical Model Estimation","summary":" Joint multimodal functional data acquisition, where functional data from\nmultiple modes are measured simultaneously from the same subject, has emerged\nas an exciting modern approach enabled by recent engineering breakthroughs in\nthe neurological and biological sciences. One prominent motivation to acquire\nsuch data is to enable new discoveries of the underlying connectivity by\ncombining multimodal signals. Despite the scientific interest, there remains a\ngap in principled statistical methods for estimating the graph underlying\nmultimodal functional data. To this end, we propose a new integrative framework\nthat models the data generation process and identifies operators mapping from\nthe observation space to the latent space. We then develop an estimator that\nsimultaneously estimates the transformation operators and the latent graph.\nThis estimator is based on the partial correlation operator, which we\nrigorously extend from the multivariate to the functional setting. Our\nprocedure is provably efficient, with the estimator converging to a stationary\npoint with quantifiable statistical error. Furthermore, we show recovery of the\nlatent graph under mild conditions. 
Our work is applied to analyze\nsimultaneously acquired multimodal brain imaging data where the graph indicates\nfunctional connectivity of the brain. We present simulation and empirical\nresults that support the benefits of joint estimation.\n","authors":["Katherine Tsai","Boxin Zhao","Sanmi Koyejo","Mladen Kolar"],"pdf_url":"https://arxiv.org/pdf/2210.17237v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.04903v2","updated":"2023-10-01T03:41:45Z","published":"2023-02-09T19:10:57Z","title":"AdaptSim: Task-Driven Simulation Adaptation for Sim-to-Real Transfer","summary":" Simulation parameter settings such as contact models and object geometry\napproximations are critical to training robust robotic policies capable of\ntransferring from simulation to real-world deployment. Previous approaches\ntypically handcraft distributions over such parameters (domain randomization),\nor identify parameters that best match the dynamics of the real environment\n(system identification). However, there is often an irreducible gap between\nsimulation and reality: attempting to match the dynamics between simulation and\nreality across all states and tasks may be infeasible and may not lead to\npolicies that perform well in reality for a specific task. Addressing this\nissue, we propose AdaptSim, a new task-driven adaptation framework for\nsim-to-real transfer that aims to optimize task performance in target (real)\nenvironments -- instead of matching dynamics between simulation and reality.\nFirst, we meta-learn an adaptation policy in simulation using reinforcement\nlearning for adjusting the simulation parameter distribution based on the\ncurrent policy's performance in a target environment. We then perform iterative\nreal-world adaptation by inferring new simulation parameter distributions for\npolicy training, using a small amount of real data. We perform experiments in\nthree robotic tasks: (1) swing-up of linearized double pendulum, (2) dynamic\ntable-top pushing of a bottle, and (3) dynamic scooping of food pieces with a\nspatula. Our extensive simulation and hardware experiments demonstrate AdaptSim\nachieving 1-3x asymptotic performance and $\\sim$2x real data efficiency when\nadapting to different environments, compared to methods based on Sys-ID and\ndirectly training the task policy in target environments. Website:\nhttps://irom-lab.github.io/AdaptSim/\n","authors":["Allen Z. Ren","Hongkai Dai","Benjamin Burchfiel","Anirudha Majumdar"],"pdf_url":"https://arxiv.org/pdf/2302.04903v2.pdf","comment":"Conference on Robot Learning (CoRL), 2023"},{"id":"http://arxiv.org/abs/2309.01026v2","updated":"2023-10-01T02:57:42Z","published":"2023-09-02T21:29:53Z","title":"Zero-Shot Recommendations with Pre-Trained Large Language Models for\n Multimodal Nudging","summary":" We present a method for zero-shot recommendation of multimodal non-stationary\ncontent that leverages recent advancements in the field of generative AI. We\npropose rendering inputs of different modalities as textual descriptions and to\nutilize pre-trained LLMs to obtain their numerical representations by computing\nsemantic embeddings. Once unified representations of all content items are\nobtained, the recommendation can be performed by computing an appropriate\nsimilarity metric between them without any additional learning. We demonstrate\nour approach on a synthetic multimodal nudging environment, where the inputs\nconsist of tabular, textual, and visual data.\n","authors":["Rachel M. 
Harrison","Anton Dereventsov","Anton Bibin"],"pdf_url":"https://arxiv.org/pdf/2309.01026v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.02935v2","updated":"2023-10-01T02:55:32Z","published":"2022-09-07T05:08:34Z","title":"Normalised clustering accuracy: An asymmetric external cluster validity\n measure","summary":" There is no, nor will there ever be, single best clustering algorithm, but we\nwould still like to be able to distinguish between methods which work well on\ncertain task types and those that systematically underperform. Clustering\nalgorithms are traditionally evaluated using either internal or external\nvalidity measures. Internal measures quantify different aspects of the obtained\npartitions, e.g., the average degree of cluster compactness or point\nseparability. Yet, their validity is questionable, because the clusterings they\npromote can sometimes be meaningless. External measures, on the other hand,\ncompare the algorithms' outputs to the reference, ground truth groupings that\nare provided by experts. In this paper, we argue that the commonly-used\nclassical partition similarity scores, such as the normalised mutual\ninformation, Fowlkes-Mallows, or adjusted Rand index, miss some desirable\nproperties, e.g., they do not identify worst-case scenarios correctly or are\nnot easily interpretable. This makes comparing clustering algorithms across\nmany benchmark datasets difficult. To remedy these issues, we propose and\nanalyse a new measure: a version of the optimal set-matching accuracy, which is\nnormalised, monotonic, scale invariant, and corrected for the imbalancedness of\ncluster sizes (but neither symmetric nor adjusted for chance).\n","authors":["Marek Gagolewski"],"pdf_url":"https://arxiv.org/pdf/2209.02935v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.14681v2","updated":"2023-10-01T02:52:00Z","published":"2023-09-26T05:10:08Z","title":"Are Human-generated Demonstrations Necessary for In-context Learning?","summary":" Despite the promising few-shot ability of large language models (LLMs), the\nstandard paradigm of In-context Learning (ICL) suffers the disadvantages of\nsusceptibility to selected demonstrations and the intricacy to generate these\ndemonstrations. In this paper, we raise the fundamental question that whether\nhuman-generated demonstrations are necessary for ICL. To answer this question,\nwe propose self-contemplation prompting strategy (SEC), a paradigm free from\nhuman-crafted demonstrations. The key point of SEC is that, instead of using\nhand-crafted examples as demonstrations in ICL, SEC asks LLMs to first create\ndemonstrations on their own, based on which the final output is generated. SEC\nis a flexible framework and can be adapted to both the vanilla ICL and the\nchain-of-thought (CoT), but with greater ease: as the manual-generation process\nof both examples and rationale can be saved. Extensive experiments in\narithmetic reasoning, commonsense reasoning, multi-task language understanding,\nand code generation benchmarks, show that SEC, which does not require\nhand-crafted demonstrations, significantly outperforms the zero-shot learning\nstrategy, and achieves comparable results to ICL with hand-crafted\ndemonstrations. 
This demonstrates that, for many tasks, contemporary LLMs\npossess a sufficient level of competence to exclusively depend on their own\ncapacity for decision making, removing the need for external training data.\nCode is available at https://github.com/ruili33/SEC.\n","authors":["Rui Li","Guoyin Wang","Jiwei Li"],"pdf_url":"https://arxiv.org/pdf/2309.14681v2.pdf","comment":"Pre-print Version"},{"id":"http://arxiv.org/abs/2301.03150v3","updated":"2023-10-01T02:43:49Z","published":"2023-01-09T02:42:39Z","title":"MOTOR: A Time-To-Event Foundation Model For Structured Medical Records","summary":" We present a self-supervised, time-to-event (TTE) foundation model called\nMOTOR (Many Outcome Time Oriented Representations) which is pretrained on\ntimestamped sequences of events in electronic health records (EHR) and health\ninsurance claims. TTE models are used for estimating the probability\ndistribution of the time until a specific event occurs, which is an important\ntask in medical settings. TTE models provide many advantages over\nclassification using fixed time horizons, including naturally handling censored\nobservations, but are challenging to train with limited labeled data. MOTOR\naddresses this challenge by pretraining on up to 55M patient records (9B\nclinical events). We evaluate MOTOR's transfer learning performance on 19\ntasks, across 3 patient databases (a private EHR system, MIMIC-IV, and Merative\nclaims data). Task-specific models adapted from MOTOR improve time-dependent C\nstatistics by 4.6% over state-of-the-art, improve label efficiency by up to 95%\n,and are more robust to temporal distributional shifts. We further evaluate\ncross-site portability by adapting our MOTOR foundation model for six\nprediction tasks on the MIMIC-IV dataset, where it outperforms all baselines.\nMOTOR is the first foundation model for medical TTE predictions and we release\na 143M parameter pretrained model for research use at [redacted URL].\n","authors":["Ethan Steinberg","Yizhe Xu","Jason Fries","Nigam Shah"],"pdf_url":"https://arxiv.org/pdf/2301.03150v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.16454v2","updated":"2023-10-01T02:31:33Z","published":"2023-03-29T04:43:03Z","title":"Conductivity Imaging from Internal Measurements with Mixed Least-Squares\n Deep Neural Networks","summary":" In this work we develop a novel approach using deep neural networks to\nreconstruct the conductivity distribution in elliptic problems from one\nmeasurement of the solution over the whole domain. The approach is based on a\nmixed reformulation of the governing equation and utilizes the standard\nleast-squares objective, with deep neural networks as ansatz functions to\napproximate the conductivity and flux simultaneously. We provide a thorough\nanalysis of the deep neural network approximations of the conductivity for both\ncontinuous and empirical losses, including rigorous error estimates that are\nexplicit in terms of the noise level, various penalty parameters and neural\nnetwork architectural parameters (depth, width and parameter bound). We also\nprovide multiple numerical experiments in two- and multi-dimensions to\nillustrate distinct features of the approach, e.g., excellent stability with\nrespect to data noise and capability of solving high-dimensional problems.\n","authors":["Bangti Jin","Xiyao Li","Qimeng Quan","Zhi Zhou"],"pdf_url":"https://arxiv.org/pdf/2303.16454v2.pdf","comment":"39 pages. 
20 figures"},{"id":"http://arxiv.org/abs/2302.09693v2","updated":"2023-10-01T02:19:50Z","published":"2023-02-19T23:27:12Z","title":"mSAM: Micro-Batch-Averaged Sharpness-Aware Minimization","summary":" Modern deep learning models are over-parameterized, where different optima\ncan result in widely varying generalization performance. The Sharpness-Aware\nMinimization (SAM) technique modifies the fundamental loss function that steers\ngradient descent methods toward flatter minima, which are believed to exhibit\nenhanced generalization prowess. Our study delves into a specific variant of\nSAM known as micro-batch SAM (mSAM). This variation involves aggregating\nupdates derived from adversarial perturbations across multiple shards\n(micro-batches) of a mini-batch during training. We extend a recently developed\nand well-studied general framework for flatness analysis to theoretically show\nthat SAM achieves flatter minima than SGD, and mSAM achieves even flatter\nminima than SAM. We provide a thorough empirical evaluation of various image\nclassification and natural language processing tasks to substantiate this\ntheoretical advancement. We also show that contrary to previous work, mSAM can\nbe implemented in a flexible and parallelizable manner without significantly\nincreasing computational costs. Our implementation of mSAM yields superior\ngeneralization performance across a wide range of tasks compared to SAM,\nfurther supporting our theoretical framework.\n","authors":["Kayhan Behdin","Qingquan Song","Aman Gupta","Sathiya Keerthi","Ayan Acharya","Borja Ocejo","Gregory Dexter","Rajiv Khanna","David Durfee","Rahul Mazumder"],"pdf_url":"https://arxiv.org/pdf/2302.09693v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2212.04343"},{"id":"http://arxiv.org/abs/2309.06034v2","updated":"2023-10-01T02:16:03Z","published":"2023-09-12T08:06:04Z","title":"Normality Learning-based Graph Anomaly Detection via Multi-Scale\n Contrastive Learning","summary":" Graph anomaly detection (GAD) has attracted increasing attention in machine\nlearning and data mining. Recent works have mainly focused on how to capture\nricher information to improve the quality of node embeddings for GAD. Despite\ntheir significant advances in detection performance, there is still a relative\ndearth of research on the properties of the task. GAD aims to discern the\nanomalies that deviate from most nodes. However, the model is prone to learn\nthe pattern of normal samples which make up the majority of samples. Meanwhile,\nanomalies can be easily detected when their behaviors differ from normality.\nTherefore, the performance can be further improved by enhancing the ability to\nlearn the normal pattern. To this end, we propose a normality learning-based\nGAD framework via multi-scale contrastive learning networks (NLGAD for\nabbreviation). Specifically, we first initialize the model with the contrastive\nnetworks on different scales. To provide sufficient and reliable normal nodes\nfor normality learning, we design an effective hybrid strategy for normality\nselection. Finally, the model is refined with the only input of reliable normal\nnodes and learns a more accurate estimate of normality so that anomalous nodes\ncan be more easily distinguished. Eventually, extensive experiments on six\nbenchmark graph datasets demonstrate the effectiveness of our normality\nlearning-based scheme on GAD. 
Notably, the proposed algorithm improves the\ndetection performance (up to 5.89% AUC gain) compared with the state-of-the-art\nmethods. The source code is released at https://github.com/FelixDJC/NLGAD.\n","authors":["Jingcan Duan","Pei Zhang","Siwei Wang","Jingtao Hu","Hu Jin","Jiaxin Zhang","Haifang Zhou","Xinwang Liu"],"pdf_url":"https://arxiv.org/pdf/2309.06034v2.pdf","comment":"10 pages, 7 figures, accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2211.15255v3","updated":"2023-10-01T02:09:49Z","published":"2022-11-28T12:17:40Z","title":"ARISE: Graph Anomaly Detection on Attributed Networks via Substructure\n Awareness","summary":" Recently, graph anomaly detection on attributed networks has attracted\ngrowing attention in data mining and machine learning communities. Apart from\nattribute anomalies, graph anomaly detection also aims at suspicious\ntopological-abnormal nodes that exhibit collective anomalous behavior. Closely\nconnected uncorrelated node groups form uncommonly dense substructures in the\nnetwork. However, existing methods overlook that the topology anomaly detection\nperformance can be improved by recognizing such a collective pattern. To this\nend, we propose a new graph anomaly detection framework on attributed networks\nvia substructure awareness (ARISE for abbreviation). Unlike previous\nalgorithms, we focus on the substructures in the graph to discern\nabnormalities. Specifically, we establish a region proposal module to discover\nhigh-density substructures in the network as suspicious regions. The average\nnode-pair similarity can be regarded as the topology anomaly degree of nodes\nwithin substructures. Generally, the lower the similarity, the higher the\nprobability that internal nodes are topology anomalies. To distill better\nembeddings of node attributes, we further introduce a graph contrastive\nlearning scheme, which observes attribute anomalies in the meantime. In this\nway, ARISE can detect both topology and attribute anomalies. Ultimately,\nextensive experiments on benchmark datasets show that ARISE greatly improves\ndetection performance (up to 7.30% AUC and 17.46% AUPRC gains) compared to\nstate-of-the-art attributed networks anomaly detection (ANAD) algorithms.\n","authors":["Jingcan Duan","Bin Xiao","Siwei Wang","Haifang Zhou","Xinwang Liu"],"pdf_url":"https://arxiv.org/pdf/2211.15255v3.pdf","comment":"13 pages, 8 figures, accepted by IEEE TNNLS"},{"id":"http://arxiv.org/abs/2309.13414v2","updated":"2023-10-01T01:55:39Z","published":"2023-09-23T15:55:12Z","title":"State-space Models with Layer-wise Nonlinearity are Universal\n Approximators with Exponential Decaying Memory","summary":" State-space models have gained popularity in sequence modelling due to their\nsimple and efficient network structures. However, the absence of nonlinear\nactivation along the temporal direction limits the model's capacity. In this\npaper, we prove that stacking state-space models with layer-wise nonlinear\nactivation is sufficient to approximate any continuous sequence-to-sequence\nrelationship. Our findings demonstrate that the addition of layer-wise\nnonlinear activation enhances the model's capacity to learn complex sequence\npatterns. Meanwhile, it can be seen both theoretically and empirically that the\nstate-space models do not fundamentally resolve the exponential decaying memory\nissue. 
Theoretical results are justified by numerical verifications.\n","authors":["Shida Wang","Beichen Xue"],"pdf_url":"https://arxiv.org/pdf/2309.13414v2.pdf","comment":"17 pages, 6 figures,"},{"id":"http://arxiv.org/abs/2304.02858v2","updated":"2023-10-01T01:54:15Z","published":"2023-04-06T04:37:10Z","title":"A review of ensemble learning and data augmentation models for class\n imbalanced problems: combination, implementation and evaluation","summary":" Class imbalance (CI) in classification problems arises when the number of\nobservations belonging to one class is lower than the other. Ensemble learning\ncombines multiple models to obtain a robust model and has been prominently used\nwith data augmentation methods to address class imbalance problems. In the last\ndecade, a number of strategies have been added to enhance ensemble learning and\ndata augmentation methods, along with new methods such as generative\nadversarial networks (GANs). A combination of these has been applied in many\nstudies, but the true rank of different combinations would require a\ncomputational review.\n In this paper, we present a computational review to evaluate data\naugmentation and ensemble learning methods used to address prominent benchmark\nCI problems. We present a general framework that evaluates 10 data augmentation\nand 10 ensemble learning methods for CI problems. Our objective is to identify\nthe most effective combination for improving classification performance on\nimbalanced datasets. The results indicate that combinations of data\naugmentation methods with ensemble learning can significantly improve\nclassification performance on imbalanced datasets. Our study is vital for the\ndevelopment of novel models for handling imbalanced datasets.\n","authors":["Azal Ahmad Khan","Omkar Chaudhari","Rohitash Chandra"],"pdf_url":"https://arxiv.org/pdf/2304.02858v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.05904v8","updated":"2023-10-01T00:50:34Z","published":"2022-06-13T05:15:12Z","title":"Superiority of GNN over NN in generalizing bandlimited functions","summary":" Graph Neural Networks (GNNs) have emerged as formidable resources for\nprocessing graph-based information across diverse applications. While the\nexpressive power of GNNs has traditionally been examined in the context of\ngraph-level tasks, their potential for node-level tasks, such as node\nclassification, where the goal is to interpolate missing node labels from the\nobserved ones, remains relatively unexplored. In this study, we investigate the\nproficiency of GNNs for such classifications, which can also be cast as a\nfunction interpolation problem. Explicitly, we focus on ascertaining the\noptimal configuration of weights and layers required for a GNN to successfully\ninterpolate a band-limited function over Euclidean cubes. Our findings\nhighlight a pronounced efficiency in utilizing GNNs to generalize a bandlimited\nfunction within an $\\varepsilon$-error margin. Remarkably, achieving this task\nnecessitates only $O_d((\\log\\varepsilon^{-1})^d)$ weights and\n$O_d((\\log\\varepsilon^{-1})^d)$ training samples. We explore how this criterion\nstacks up against the explicit constructions of currently available Neural\nNetworks (NNs) designed for similar tasks. Significantly, our result is\nobtained by drawing an innovative connection between the GNN structures and\nclassical sampling theorems. 
In essence, our pioneering work marks a meaningful\ncontribution to the research domain, advancing our understanding of the\npractical GNN applications.\n","authors":["A. Martina Neuman","Rongrong Wang","Yuying Xie"],"pdf_url":"https://arxiv.org/pdf/2206.05904v8.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.02614v2","updated":"2023-10-01T00:29:44Z","published":"2023-05-04T07:43:40Z","title":"High-dimensional Bayesian Optimization via Semi-supervised Learning with\n Optimized Unlabeled Data Sampling","summary":" Bayesian optimization (BO) is a powerful sequential optimization approach for\nseeking the global optimum of black-box functions for sample efficiency\npurposes. Evaluations of black-box functions can be expensive, rendering\nreduced use of labeled data desirable. For the first time, we introduce a\nteacher-student model, called $\\texttt{TSBO}$, to enable semi-supervised\nlearning that can make use of large amounts of cheaply generated unlabeled data\nunder the context of BO to enhance the generalization of data query models. Our\nteacher-student model is uncertainty-aware and offers a practical mechanism for\nleveraging the pseudo labels generated for unlabeled data while dealing with\nthe involved risk. We show that the selection of unlabeled data is key to\n$\\texttt{TSBO}$. We optimize unlabeled data sampling by generating unlabeled\ndata from a dynamically fitted extreme value distribution or a parameterized\nsampling distribution learned by minimizing the student feedback.\n$\\texttt{TSBO}$ is capable of operating in a learned latent space with reduced\ndimensionality, providing scalability to high-dimensional problems.\n$\\texttt{TSBO}$ demonstrates the significant sample efficiency in several\nglobal optimization tasks under tight labeled data budgets.\n","authors":["Yuxuan Yin","Yu Wang","Peng Li"],"pdf_url":"https://arxiv.org/pdf/2305.02614v2.pdf","comment":"15 pages"}],"Multimedia":[{"id":"http://arxiv.org/abs/2303.17550v4","updated":"2023-10-01T11:20:26Z","published":"2023-03-30T17:18:31Z","title":"DAE-Talker: High Fidelity Speech-Driven Talking Face Generation with\n Diffusion Autoencoder","summary":" While recent research has made significant progress in speech-driven talking\nface generation, the quality of the generated video still lags behind that of\nreal recordings. One reason for this is the use of handcrafted intermediate\nrepresentations like facial landmarks and 3DMM coefficients, which are designed\nbased on human knowledge and are insufficient to precisely describe facial\nmovements. Additionally, these methods require an external pretrained model for\nextracting these representations, whose performance sets an upper bound on\ntalking face generation. To address these limitations, we propose a novel\nmethod called DAE-Talker that leverages data-driven latent representations\nobtained from a diffusion autoencoder (DAE). DAE contains an image encoder that\nencodes an image into a latent vector and a DDIM image decoder that\nreconstructs the image from it. We train our DAE on talking face video frames\nand then extract their latent representations as the training target for a\nConformer-based speech2latent model. 
This allows DAE-Talker to synthesize full\nvideo frames and produce natural head movements that align with the content of\nspeech, rather than relying on a predetermined head pose from a template video.\nWe also introduce pose modelling in speech2latent for pose controllability.\nAdditionally, we propose a novel method for generating continuous video frames\nwith the DDIM image decoder trained on individual frames, eliminating the need\nfor modelling the joint distribution of consecutive frames directly. Our\nexperiments show that DAE-Talker outperforms existing popular methods in\nlip-sync, video fidelity, and pose naturalness. We also conduct ablation\nstudies to analyze the effectiveness of the proposed techniques and demonstrate\nthe pose controllability of DAE-Talker.\n","authors":["Chenpeng Du","Qi Chen","Tianyu He","Xu Tan","Xie Chen","Kai Yu","Sheng Zhao","Jiang Bian"],"pdf_url":"https://arxiv.org/pdf/2303.17550v4.pdf","comment":"Accepted to ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2309.01026v2","updated":"2023-10-01T02:57:42Z","published":"2023-09-02T21:29:53Z","title":"Zero-Shot Recommendations with Pre-Trained Large Language Models for\n Multimodal Nudging","summary":" We present a method for zero-shot recommendation of multimodal non-stationary\ncontent that leverages recent advancements in the field of generative AI. We\npropose rendering inputs of different modalities as textual descriptions and to\nutilize pre-trained LLMs to obtain their numerical representations by computing\nsemantic embeddings. Once unified representations of all content items are\nobtained, the recommendation can be performed by computing an appropriate\nsimilarity metric between them without any additional learning. We demonstrate\nour approach on a synthetic multimodal nudging environment, where the inputs\nconsist of tabular, textual, and visual data.\n","authors":["Rachel M. Harrison","Anton Dereventsov","Anton Bibin"],"pdf_url":"https://arxiv.org/pdf/2309.01026v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00788v1","updated":"2023-10-01T20:55:13Z","published":"2023-10-01T20:55:13Z","title":"Web Image Formats: Assessment of Their Real-World-Usage and Performance\n across Popular Web Browsers","summary":" In 2023, images on the web make up 41% of transmitted data, significantly\nimpacting the performance of web apps. Fortunately, image formats like WEBP and\nAVIF could offer advanced compression and faster page loading, but may face\nperformance disparities across browsers. Therefore, we conducted performance\nevaluations on five major browsers - Chrome, Edge, Safari, Opera, and Firefox -\nwhile comparing four image formats. The results indicate that the newer formats\nexhibited notable performance enhancements across all browsers, leading to\nshorter loading times. Compared to the compressed JPEG format, WEBP and AVIF\nimproved the Page Load Time by 21% and 15%, respectively. However, web scraping\nrevealed that JPEG and PNG still dominate web image choices, with WEBP at 4% as\nthe most used new format. 
Through the web scraping and web performance\nevaluation, this research serves to (1) explore image format preferences in web\napplications and analyze distribution and characteristics across\nfrequently-visited sites in 2023 and (2) assess the performance impact of\ndistinct web image formats on application load times across popular web\nbrowsers.\n","authors":["Benedikt Dornauer","Michael Felderer"],"pdf_url":"https://arxiv.org/pdf/2310.00788v1.pdf","comment":"Preprint: Product-Focused Software Process Improvement 24th\n International Conference, PROFES 2023, Dornbirn, Austria , Dezember 10-13,\n 2023, Proceedings"},{"id":"http://arxiv.org/abs/2310.00647v1","updated":"2023-10-01T12:02:59Z","published":"2023-10-01T12:02:59Z","title":"Beyond Task Performance: Evaluating and Reducing the Flaws of Large\n Multimodal Models with In-Context Learning","summary":" Following the success of Large Language Models (LLMs), Large Multimodal\nModels (LMMs), such as the Flamingo model and its subsequent competitors, have\nstarted to emerge as natural steps towards generalist agents. However,\ninteracting with recent LMMs reveals major limitations that are hardly captured\nby the current evaluation benchmarks. Indeed, task performances (e.g., VQA\naccuracy) alone do not provide enough clues to understand their real\ncapabilities, limitations, and to which extent such models are aligned to human\nexpectations. To refine our understanding of those flaws, we deviate from the\ncurrent evaluation paradigm and propose the EvALign-ICL framework, in which we\n(1) evaluate 8 recent open-source LMMs (based on the Flamingo architecture such\nas OpenFlamingo and IDEFICS) on 5 different axes; hallucinations, abstention,\ncompositionality, explainability and instruction following. Our evaluation on\nthese axes reveals major flaws in LMMs. To efficiently address these problems,\nand inspired by the success of in-context learning (ICL) in LLMs, (2) we\nexplore ICL as a solution and study how it affects these limitations. Based on\nour ICL study, (3) we push ICL further and propose new multimodal ICL\napproaches such as; Multitask-ICL, Chain-of-Hindsight-ICL, and\nSelf-Correcting-ICL. Our findings are as follows; (1) Despite their success,\nLMMs have flaws that remain unsolved with scaling alone. (2) The effect of ICL\non LMMs flaws is nuanced; despite its effectiveness for improved\nexplainability, abstention, and instruction following, ICL does not improve\ncompositional abilities, and actually even amplifies hallucinations. (3) The\nproposed ICL variants are promising as post-hoc approaches to efficiently\ntackle some of those flaws. The code is available here:\nhttps://evalign-icl.github.io/\n","authors":["Mustafa Shukor","Alexandre Rame","Corentin Dancette","Matthieu Cord"],"pdf_url":"https://arxiv.org/pdf/2310.00647v1.pdf","comment":"Project Page: https://evalign-icl.github.io/"},{"id":"http://arxiv.org/abs/2310.00559v1","updated":"2023-10-01T03:29:21Z","published":"2023-10-01T03:29:21Z","title":"CPIPS: Learning to Preserve Perceptual Distances in End-to-End Image\n Compression","summary":" Lossy image coding standards such as JPEG and MPEG have successfully achieved\nhigh compression rates for human consumption of multimedia data. However, with\nthe increasing prevalence of IoT devices, drones, and self-driving cars,\nmachines rather than humans are processing a greater portion of captured visual\ncontent. 
Consequently, it is crucial to pursue an efficient compressed\nrepresentation that caters not only to human vision but also to image\nprocessing and machine vision tasks. Drawing inspiration from the efficient\ncoding hypothesis in biological systems and the modeling of the sensory cortex\nin neural science, we repurpose the compressed latent representation to\nprioritize semantic relevance while preserving perceptual distance. Our\nproposed method, Compressed Perceptual Image Patch Similarity (CPIPS), can be\nderived at a minimal cost from a learned neural codec and computed\nsignificantly faster than DNN-based perceptual metrics such as LPIPS and DISTS.\n","authors":["Chen-Hsiu Huang","Ja-Ling Wu"],"pdf_url":"https://arxiv.org/pdf/2310.00559v1.pdf","comment":"7 pages, 5 figures; accepted by APSIPA ASC 2023"}]},"2023-09-30T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2308.10819v2","updated":"2023-09-30T19:14:45Z","published":"2023-08-17T06:21:50Z","title":"Evaluating the Instruction-Following Robustness of Large Language Models\n to Prompt Injection","summary":" Large Language Models (LLMs) have shown remarkable proficiency in following\ninstructions, making them valuable in customer-facing applications. However,\ntheir impressive capabilities also raise concerns about the amplification of\nrisks posed by adversarial instructions, which can be injected into the model\ninput by third-party attackers to manipulate LLMs' original instructions and\nprompt unintended actions and content. Therefore, it is crucial to understand\nLLMs' ability to accurately discern which instructions to follow to ensure\ntheir safe deployment in real-world scenarios. In this paper, we propose a\npioneering benchmark for automatically evaluating the robustness of\ninstruction-following LLMs against adversarial instructions injected in the\nprompt. The objective of this benchmark is to quantify the extent to which LLMs\nare influenced by injected adversarial instructions and assess their ability to\ndifferentiate between these injected adversarial instructions and original user\ninstructions. Through experiments conducted with state-of-the-art\ninstruction-following LLMs, we uncover significant limitations in their\nrobustness against adversarial instruction injection attacks. Furthermore, our\nfindings indicate that prevalent instruction-tuned models are prone to being\n``overfitted'' to follow any instruction phrase in the prompt without truly\nunderstanding which instructions should be followed. This highlights the need\nto address the challenge of training models to comprehend prompts instead of\nmerely following instruction phrases and completing the text. The data and code\ncan be found at \\url{https://github.com/Leezekun/Adv-Instruct-Eval}.\n","authors":["Zekun Li","Baolin Peng","Pengcheng He","Xifeng Yan"],"pdf_url":"https://arxiv.org/pdf/2308.10819v2.pdf","comment":"The data and code can be found at\n https://github.com/Leezekun/Adv-Instruct-Eval"},{"id":"http://arxiv.org/abs/2309.12491v2","updated":"2023-09-30T19:00:32Z","published":"2023-09-21T21:21:55Z","title":"Exploring the Impact of Training Data Distribution and Subword\n Tokenization on Gender Bias in Machine Translation","summary":" We study the effect of tokenization on gender bias in machine translation, an\naspect that has been largely overlooked in previous works. 
Specifically, we\nfocus on the interactions between the frequency of gendered profession names in\ntraining data, their representation in the subword tokenizer's vocabulary, and\ngender bias. We observe that female and non-stereotypical gender inflections of\nprofession names (e.g., Spanish \"doctora\" for \"female doctor\") tend to be split\ninto multiple subword tokens. Our results indicate that the imbalance of gender\nforms in the model's training corpus is a major factor contributing to gender\nbias and has a greater impact than subword splitting. We show that analyzing\nsubword splits provides good estimates of gender-form imbalance in the training\ndata and can be used even when the corpus is not publicly available. We also\ndemonstrate that fine-tuning just the token embedding layer can decrease the\ngap in gender prediction accuracy between female and male forms without\nimpairing the translation quality.\n","authors":["Bar Iluz","Tomasz Limisiewicz","Gabriel Stanovsky","David Mareček"],"pdf_url":"https://arxiv.org/pdf/2309.12491v2.pdf","comment":"Accepted to AACL 2023"},{"id":"http://arxiv.org/abs/2309.13702v2","updated":"2023-09-30T18:56:04Z","published":"2023-09-24T17:19:36Z","title":"Skill Check: Some Considerations on the Evaluation of Gamemastering\n Models for Role-playing Games","summary":" In role-playing games a Game Master (GM) is the player in charge of the game,\nwho must design the challenges the players face and narrate the outcomes of\ntheir actions. In this work we discuss some challenges to model GMs from an\nInteractive Storytelling and Natural Language Processing perspective. Following\nthose challenges we propose three test categories to evaluate such dialogue\nsystems, and we use them to test ChatGPT, Bard and OpenAssistant as\nout-of-the-box GMs.\n","authors":["Santiago Góngora","Luis Chiruzzo","Gonzalo Méndez","Pablo Gervás"],"pdf_url":"https://arxiv.org/pdf/2309.13702v2.pdf","comment":"11 pages. Accepted at GALA 2023 (Games and Learning Alliance 12th\n International Conference)"},{"id":"http://arxiv.org/abs/2309.11054v2","updated":"2023-09-30T15:51:38Z","published":"2023-09-20T04:17:28Z","title":"Design of Chain-of-Thought in Math Problem Solving","summary":" Chain-of-Thought (CoT) plays a crucial role in reasoning for math problem\nsolving. We conduct a comprehensive examination of methods for designing CoT,\ncomparing conventional natural language CoT with various program CoTs,\nincluding the self-describing program, the comment-describing program, and the\nnon-describing program. Furthermore, we investigate the impact of programming\nlanguage on program CoTs, comparing Python and Wolfram Language. Through\nextensive experiments on GSM8K, MATHQA, and SVAMP, we find that program CoTs\noften have superior effectiveness in math problem solving. Notably, the best\nperforming combination with 30B parameters beats GPT-3.5-turbo by a significant\nmargin. The results show that self-describing program offers greater diversity\nand thus can generally achieve higher performance. We also find that Python is\na better choice of language than Wolfram for program CoTs. 
The experimental\nresults provide a valuable guideline for future CoT designs that take into\naccount both programming language and coding style for further advancements.\nOur datasets and code are publicly available.\n","authors":["Zhanming Jie","Trung Quoc Luong","Xinbo Zhang","Xiaoran Jin","Hang Li"],"pdf_url":"https://arxiv.org/pdf/2309.11054v2.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2305.15090v2","updated":"2023-09-30T08:37:28Z","published":"2023-05-24T12:15:19Z","title":"STAR: Improving Low-Resource Information Extraction by Structure-to-Text\n Data Generation with Large Language Models","summary":" Information extraction tasks such as event extraction require an in-depth\nunderstanding of the output structure and sub-task dependencies. They heavily\nrely on task-specific training data in the form of (passage, target structure)\npairs to obtain reasonable performance. However, obtaining such data through\nhuman annotation is costly, leading to a pressing need for low-resource\ninformation extraction approaches that require minimal human labeling for\nreal-world applications. Fine-tuning supervised models with synthesized\ntraining data would be a generalizable method, but the existing data generation\nmethods either still rely on large-scale ground-truth data or cannot be applied\nto complicated IE tasks due to their poor performance. To address these\nchallenges, we propose STAR, a data generation method that leverages Large\nLanguage Models (LLMs) to synthesize data instances given limited seed\ndemonstrations, thereby boosting low-resource information extraction\nperformance. Our approach involves generating target structures (Y) followed by\ngenerating passages (X), all accomplished with the aid of LLMs. We design\nfine-grained step-by-step instructions to obtain the initial data instances. We\nfurther reduce errors and improve data quality through self-reflection error\nidentification and self-refinement with iterative revision. Our experiments\nshow that the data generated by STAR significantly improves the performance of\nlow-resource event extraction and relation extraction tasks, even surpassing\nthe effectiveness of human-curated data. Human assessment of the data quality\nshows STAR-generated data exhibits higher passage quality and better align with\nthe task definitions compared with the human-curated data.\n","authors":["Mingyu Derek Ma","Xiaoxuan Wang","Po-Nien Kung","P. Jeffrey Brantingham","Nanyun Peng","Wei Wang"],"pdf_url":"https://arxiv.org/pdf/2305.15090v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.11738v2","updated":"2023-09-30T08:35:29Z","published":"2023-05-19T15:19:44Z","title":"CRITIC: Large Language Models Can Self-Correct with Tool-Interactive\n Critiquing","summary":" Recent developments in large language models (LLMs) have been impressive.\nHowever, these models sometimes show inconsistencies and problematic behavior,\nsuch as hallucinating facts, generating flawed code, or creating offensive and\ntoxic content. Unlike these models, humans typically utilize external tools to\ncross-check and refine their initial content, like using a search engine for\nfact-checking, or a code interpreter for debugging. Inspired by this\nobservation, we introduce a framework called CRITIC that allows LLMs, which are\nessentially \"black boxes\" to validate and progressively amend their own outputs\nin a manner similar to human interaction with tools. 
More specifically,\nstarting with an initial output, CRITIC interacts with appropriate tools to\nevaluate certain aspects of the text, and then revises the output based on the\nfeedback obtained during this validation process. Comprehensive evaluations\ninvolving free-form question answering, mathematical program synthesis, and\ntoxicity reduction demonstrate that CRITIC consistently enhances the\nperformance of LLMs. Meanwhile, our research highlights the crucial importance\nof external feedback in promoting the ongoing self-improvement of LLMs.\n","authors":["Zhibin Gou","Zhihong Shao","Yeyun Gong","Yelong Shen","Yujiu Yang","Nan Duan","Weizhu Chen"],"pdf_url":"https://arxiv.org/pdf/2305.11738v2.pdf","comment":"add LLaMA-2 7B to 70B results; add more mathematical program\n synthesis datasets"},{"id":"http://arxiv.org/abs/2305.10865v2","updated":"2023-09-30T08:27:28Z","published":"2023-05-18T10:37:54Z","title":"Semantically Aligned Task Decomposition in Multi-Agent Reinforcement\n Learning","summary":" The difficulty of appropriately assigning credit is particularly heightened\nin cooperative MARL with sparse reward, due to the concurrent time and\nstructural scales involved. Automatic subgoal generation (ASG) has recently\nemerged as a viable MARL approach inspired by utilizing subgoals in\nintrinsically motivated reinforcement learning. However, end-to-end learning of\ncomplex task planning from sparse rewards without prior knowledge, undoubtedly\nrequires massive training samples. Moreover, the diversity-promoting nature of\nexisting ASG methods can lead to the \"over-representation\" of subgoals,\ngenerating numerous spurious subgoals of limited relevance to the actual task\nreward and thus decreasing the sample efficiency of the algorithm. To address\nthis problem and inspired by the disentangled representation learning, we\npropose a novel \"disentangled\" decision-making method, Semantically Aligned\ntask decomposition in MARL (SAMA), that prompts pretrained language models with\nchain-of-thought that can suggest potential goals, provide suitable goal\ndecomposition and subgoal allocation as well as self-reflection-based\nreplanning. Additionally, SAMA incorporates language-grounded RL to train each\nagent's subgoal-conditioned policy. SAMA demonstrates considerable advantages\nin sample efficiency compared to state-of-the-art ASG methods, as evidenced by\nits performance on two challenging sparse-reward tasks, Overcooked and MiniRTS.\n","authors":["Wenhao Li","Dan Qiao","Baoxiang Wang","Xiangfeng Wang","Bo Jin","Hongyuan Zha"],"pdf_url":"https://arxiv.org/pdf/2305.10865v2.pdf","comment":"54 pages, 16 figures"},{"id":"http://arxiv.org/abs/2308.12030v2","updated":"2023-09-30T07:54:22Z","published":"2023-08-23T09:43:10Z","title":"Prompt-Based Length Controlled Generation with Reinforcement Learning","summary":" Large language models (LLMs) like ChatGPT and GPT-4 have attracted great\nattention given their surprising performance on a wide range of NLP tasks.\nLength controlled generation of LLMs emerges as an important topic, which\nenables users to fully leverage the capability of LLMs in more real-world\nscenarios like generating a proper answer or essay of a desired length. In\naddition, the autoregressive generation in LLMs is extremely time-consuming,\nwhile the ability of controlling this generated length can reduce the inference\ncost by limiting the length. Therefore, we propose a prompt-based length\ncontrol method to achieve high-accuracy length controlled generation. 
In\nparticular, we adopt reinforcement learning with the reward signal given by\neither trainable or rule-based reward models, which further enhances the\nlength-control ability of LLMs by rewarding outputs that follows pre-defined\ncontrol instruction. To enable rule-based inference, we also introduce standard\nprompt extractor to collect the standard control information from users' input.\nExperiments show that our method significantly improves the accuracy of\nprompt-based length control for summarization task on popular datasets like\nCNNDM and NYT. Both the standard prompt extractor and the RL-tuned model have\nshow strong generalization ability to unseen control prompt templates.\n","authors":["Renlong Jie","Xiaojun Meng","Lifeng Shang","Xin Jiang","Qun Liu"],"pdf_url":"https://arxiv.org/pdf/2308.12030v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16298v2","updated":"2023-09-30T07:01:13Z","published":"2023-09-28T09:50:27Z","title":"At Which Training Stage Does Code Data Help LLMs Reasoning?","summary":" Large Language Models (LLMs) have exhibited remarkable reasoning capabilities\nand become the foundation of language technologies. Inspired by the great\nsuccess of code data in training LLMs, we naturally wonder at which training\nstage introducing code data can really help LLMs reasoning. To this end, this\npaper systematically explores the impact of code data on LLMs at different\nstages. Concretely, we introduce the code data at the pre-training stage,\ninstruction-tuning stage, and both of them, respectively. Then, the reasoning\ncapability of LLMs is comprehensively and fairly evaluated via six reasoning\ntasks in five domains. We critically analyze the experimental results and\nprovide conclusions with insights. First, pre-training LLMs with the mixture of\ncode and text can significantly enhance LLMs' general reasoning capability\nalmost without negative transfer on other tasks. Besides, at the\ninstruction-tuning stage, code data endows LLMs the task-specific reasoning\ncapability. Moreover, the dynamic mixing strategy of code and text data assists\nLLMs to learn reasoning capability step-by-step during training. These insights\ndeepen the understanding of LLMs regarding reasoning ability for their\napplication, such as scientific question answering, legal support, etc. The\nsource code and model parameters are released at the\nlink:~\\url{https://github.com/yingweima2022/CodeLLM}.\n","authors":["Yingwei Ma","Yue Liu","Yue Yu","Yuanliang Zhang","Yu Jiang","Changjian Wang","Shanshan Li"],"pdf_url":"https://arxiv.org/pdf/2309.16298v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07697v3","updated":"2023-09-30T05:00:58Z","published":"2023-07-15T03:31:38Z","title":"Think-on-Graph: Deep and Responsible Reasoning of Large Language Model\n on Knowledge Graph","summary":" Although large language models (LLMs) have achieved significant success in\nvarious tasks, they often struggle with hallucination problems, especially in\nscenarios requiring deep and responsible reasoning. These issues could be\npartially addressed by introducing external knowledge graphs (KG) in LLM\nreasoning. In this paper, we propose a new LLM-KG integrating paradigm\n``$\\hbox{LLM}\\otimes\\hbox{KG}$'' which treats the LLM as an agent to\ninteractively explore related entities and relations on KGs and perform\nreasoning based on the retrieved knowledge. 
We further implement this paradigm\nby introducing a new approach called Think-on-Graph (ToG), in which the LLM\nagent iteratively executes beam search on KG, discovers the most promising\nreasoning paths, and returns the most likely reasoning results. We use a number\nof well-designed experiments to examine and illustrate the following advantages\nof ToG: 1) compared with LLMs, ToG has better deep reasoning power; 2) ToG has\nthe ability of knowledge traceability and knowledge correctability by\nleveraging LLMs reasoning and expert feedback; 3) ToG provides a flexible\nplug-and-play framework for different LLMs, KGs and prompting strategies\nwithout any additional training cost; 4) the performance of ToG with small LLM\nmodels could exceed large LLM such as GPT-4 in certain scenarios and this\nreduces the cost of LLM deployment and application. As a training-free method\nwith lower computational cost and better generality, ToG achieves overall SOTA\nin 6 out of 9 datasets where most previous SOTAs rely on additional training.\n","authors":["Jiashuo Sun","Chengjin Xu","Lumingyuan Tang","Saizhuo Wang","Chen Lin","Yeyun Gong","Lionel M. Ni","Heung-Yeung Shum","Jian Guo"],"pdf_url":"https://arxiv.org/pdf/2307.07697v3.pdf","comment":"30 pages, 13 figures, 20 tables"},{"id":"http://arxiv.org/abs/2309.13061v2","updated":"2023-09-30T04:31:22Z","published":"2023-09-11T18:05:12Z","title":"Applying BioBERT to Extract Germline Gene-Disease Associations for\n Building a Knowledge Graph from the Biomedical Literature","summary":" Published biomedical information has and continues to rapidly increase. The\nrecent advancements in Natural Language Processing (NLP), have generated\nconsiderable interest in automating the extraction, normalization, and\nrepresentation of biomedical knowledge about entities such as genes and\ndiseases. Our study analyzes germline abstracts in the construction of\nknowledge graphs of the of the immense work that has been done in this area for\ngenes and diseases. This paper presents SimpleGermKG, an automatic knowledge\ngraph construction approach that connects germline genes and diseases. For the\nextraction of genes and diseases, we employ BioBERT, a pre-trained BERT model\non biomedical corpora. We propose an ontology-based and rule-based algorithm to\nstandardize and disambiguate medical terms. For semantic relationships between\narticles, genes, and diseases, we implemented a part-whole relation approach to\nconnect each entity with its data source and visualize them in a graph-based\nknowledge representation. Lastly, we discuss the knowledge graph applications,\nlimitations, and challenges to inspire the future research of germline corpora.\nOur knowledge graph contains 297 genes, 130 diseases, and 46,747 triples.\nGraph-based visualizations are used to show the results.\n","authors":["Armando D. Diaz Gonzalez","Songhui Yue","Sean T. Hayes","Kevin S. Hughes"],"pdf_url":"https://arxiv.org/pdf/2309.13061v2.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2307.08701v2","updated":"2023-09-30T02:59:34Z","published":"2023-07-17T17:59:40Z","title":"AlpaGasus: Training A Better Alpaca with Fewer Data","summary":" Large language models~(LLMs) strengthen instruction-following capability\nthrough instruction-finetuning (IFT) on supervised instruction/response data.\nHowever, widely used IFT datasets (e.g., Alpaca's 52k data) surprisingly\ncontain many low-quality instances with incorrect or irrelevant responses,\nwhich are misleading and detrimental to IFT. 
In this paper, we propose a simple\nand effective data selection strategy that automatically identifies and filters\nout low-quality data using a strong LLM (e.g., ChatGPT). To this end, we\nintroduce AlpaGasus, which is finetuned on only 9k high-quality data filtered\nfrom the 52k Alpaca data. AlpaGasus significantly outperforms the original\nAlpaca as evaluated by GPT-4 on multiple test sets and the controlled human\nevaluation. Its 13B variant matches $>90\\%$ performance of its teacher LLM\n(i.e., Text-Davinci-003 generating the 52k data) on test tasks. It also\nprovides 5.7x faster training, reducing the training time for a 7B variant from\n80 minutes (for Alpaca) to 14 minutes. Moreover, the experiments prove the\nefficacy of our method across diverse datasets, base models, and LLM filters.\nOverall, AlpaGasus demonstrates a novel data-centric IFT paradigm that can be\ngenerally applied to instruction-tuning data, leading to faster training and\nbetter instruction-following models. Our project page is available at:\n\\url{https://lichang-chen.github.io/AlpaGasus/}\n","authors":["Lichang Chen","Shiyang Li","Jun Yan","Hai Wang","Kalpa Gunaratna","Vikas Yadav","Zheng Tang","Vijay Srinivasan","Tianyi Zhou","Heng Huang","Hongxia Jin"],"pdf_url":"https://arxiv.org/pdf/2307.08701v2.pdf","comment":"32 Pages; 29 Figures; 15 Tables"},{"id":"http://arxiv.org/abs/2305.13330v2","updated":"2023-09-30T02:16:31Z","published":"2023-05-19T01:59:20Z","title":"Unsupervised ASR via Cross-Lingual Pseudo-Labeling","summary":" Recent work has shown that it is possible to train an $\\textit{unsupervised}$\nautomatic speech recognition (ASR) system using only unpaired audio and text.\nExisting unsupervised ASR methods assume that no labeled data can be used for\ntraining. We argue that even if one does not have any labeled audio for a given\nlanguage, there is $\\textit{always}$ labeled data available for other\nlanguages. We show that it is possible to use character-level acoustic models\n(AMs) from other languages to bootstrap an $\\textit{unsupervised}$ AM in a new\nlanguage. Here, \"unsupervised\" means no labeled audio is available for the\n$\\textit{target}$ language. Our approach is based on two key ingredients: (i)\ngenerating pseudo-labels (PLs) of the $\\textit{target}$ language using some\n$\\textit{other}$ language AM and (ii) constraining these PLs with a\n$\\textit{target language model}$. Our approach is effective on Common Voice:\ne.g. transfer of English AM to Swahili achieves 18% WER. It also outperforms\ncharacter-based wav2vec-U 2.0 by 15% absolute WER on LJSpeech with 800h of\nlabeled German data instead of 60k hours of unlabeled English data.\n","authors":["Tatiana Likhomanenko","Loren Lugosch","Ronan Collobert"],"pdf_url":"https://arxiv.org/pdf/2305.13330v2.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2309.11998v3","updated":"2023-09-30T00:30:51Z","published":"2023-09-21T12:13:55Z","title":"LMSYS-Chat-1M: A Large-Scale Real-World LLM Conversation Dataset","summary":" Studying how people interact with large language models (LLMs) in real-world\nscenarios is increasingly important due to their widespread use in various\napplications. In this paper, we introduce LMSYS-Chat-1M, a large-scale dataset\ncontaining one million real-world conversations with 25 state-of-the-art LLMs.\nThis dataset is collected from 210K unique IP addresses in the wild on our\nVicuna demo and Chatbot Arena website. 
We offer an overview of the dataset's\ncontent, including its curation process, basic statistics, and topic\ndistribution, highlighting its diversity, originality, and scale. We\ndemonstrate its versatility through four use cases: developing content\nmoderation models that perform similarly to GPT-4, building a safety benchmark,\ntraining instruction-following models that perform similarly to Vicuna, and\ncreating challenging benchmark questions. We believe that this dataset will\nserve as a valuable resource for understanding and advancing LLM capabilities.\nThe dataset is publicly available at\nhttps://huggingface.co/datasets/lmsys/lmsys-chat-1m.\n","authors":["Lianmin Zheng","Wei-Lin Chiang","Ying Sheng","Tianle Li","Siyuan Zhuang","Zhanghao Wu","Yonghao Zhuang","Zhuohan Li","Zi Lin","Eric. P Xing","Joseph E. Gonzalez","Ion Stoica","Hao Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.11998v3.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2305.06568v2","updated":"2023-09-30T23:15:01Z","published":"2023-05-11T05:02:11Z","title":"Convolutional Neural Networks Rarely Learn Shape for Semantic\n Segmentation","summary":" Shape learning, or the ability to leverage shape information, could be a\ndesirable property of convolutional neural networks (CNNs) when target objects\nhave specific shapes. While some research on the topic is emerging, there is no\nsystematic study to conclusively determine whether and under what circumstances\nCNNs learn shape. Here, we present such a study in the context of segmentation\nnetworks where shapes are particularly important. We define shape and propose a\nnew behavioral metric to measure the extent to which a CNN utilizes shape\ninformation. We then execute a set of experiments with synthetic and real-world\ndata to progressively uncover under which circumstances CNNs learn shape and\nwhat can be done to encourage such behavior. We conclude that (i) CNNs do not\nlearn shape in typical settings but rather rely on other features available to\nidentify the objects of interest, (ii) CNNs can learn shape, but only if the\nshape is the only feature available to identify the object, (iii) sufficiently\nlarge receptive field size relative to the size of target objects is necessary\nfor shape learning; (iv) a limited set of augmentations can encourage shape\nlearning; (v) learning shape is indeed useful in the presence of\nout-of-distribution data.\n","authors":["Yixin Zhang","Maciej A. Mazurowski"],"pdf_url":"https://arxiv.org/pdf/2305.06568v2.pdf","comment":"Accepted by Pattern Recognition"},{"id":"http://arxiv.org/abs/2305.10616v3","updated":"2023-09-30T22:44:48Z","published":"2023-05-18T00:04:38Z","title":"Evaluation Metrics for DNNs Compression","summary":" There is a lot of ongoing research effort into developing different\ntechniques for neural networks compression. However, the community lacks\nstandardised evaluation metrics, which are key to identifying the most suitable\ncompression technique for different applications. This paper reviews existing\nneural network compression evaluation metrics and implements them into a\nstandardisation framework called NetZIP. We introduce two novel metrics to\ncover existing gaps of evaluation in the literature: 1) Compression and\nHardware Agnostic Theoretical Speed (CHATS) and 2) Overall Compression Success\n(OCS). 
We demonstrate the use of NetZIP using two case studies on two different\nhardware platforms (a PC and a Raspberry Pi 4) focusing on object\nclassification and object detection.\n","authors":["Abanoub Ghobrial","Samuel Budgett","Dieter Balemans","Hamid Asgari","Phil Reiter","Kerstin Eder"],"pdf_url":"https://arxiv.org/pdf/2305.10616v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.07384v2","updated":"2023-09-30T21:26:44Z","published":"2022-11-14T14:11:31Z","title":"Language models are good pathologists: using attention-based sequence\n reduction and text-pretrained transformers for efficient WSI classification","summary":" In digital pathology, Whole Slide Image (WSI) analysis is usually formulated\nas a Multiple Instance Learning (MIL) problem. Although transformer-based\narchitectures have been used for WSI classification, these methods require\nmodifications to adapt them to specific challenges of this type of image data.\nAmong these challenges is the amount of memory and compute required by deep\ntransformer models to process long inputs, such as the thousands of image\npatches that can compose a WSI at $\\times 10$ or $\\times 20$ magnification. We\nintroduce \\textit{SeqShort}, a multi-head attention-based sequence shortening\nlayer to summarize each WSI in a fixed- and short-sized sequence of instances,\nthat allows us to reduce the computational costs of self-attention on long\nsequences, and to include positional information that is unavailable in other\nMIL approaches. Furthermore, we show that WSI classification performance can be\nimproved when the downstream transformer architecture has been pre-trained on a\nlarge corpus of text data, and only fine-tuning less than 0.1\\% of its\nparameters. We demonstrate the effectiveness of our method in lymph node\nmetastases classification and cancer subtype classification tasks, without the\nneed of designing a WSI-specific transformer nor doing in-domain pre-training,\nkeeping a reduced compute budget and low number of trainable parameters.\n","authors":["Juan I. Pisula","Katarzyna Bozek"],"pdf_url":"https://arxiv.org/pdf/2211.07384v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14428v2","updated":"2023-09-30T21:05:21Z","published":"2023-05-23T18:00:22Z","title":"Prompting Language-Informed Distribution for Compositional Zero-Shot\n Learning","summary":" Compositional zero-shot learning (CZSL) task aims to recognize unseen\ncompositional visual concepts, e.g., sliced tomatoes, where the model is\nlearned only from the seen compositions, e.g., sliced potatoes and red\ntomatoes. Thanks to the prompt tuning on large pre-trained visual language\nmodels such as CLIP, recent literature shows impressively better CZSL\nperformance than traditional vision-based methods. However, the key aspects\nthat impact the generalization to unseen compositions, including the diversity\nand informativeness of class context, and the entanglement between visual\nprimitives, i.e., state and object, are not properly addressed in existing\nCLIP-based CZSL literature. In this paper, we propose a model by prompting the\nlanguage-informed distribution, aka., PLID, for the CZSL task. Specifically,\nthe PLID leverages pre-trained large language models (LLM) to 1) formulate the\nlanguage-informed class distributions which are diverse and informative, and 2)\nenhance the compositionality of the class embedding. 
Moreover, a\nvisual-language primitive decomposition (VLPD) module and a stochastic logit\nmixup (SLM) strategy are proposed to dynamically fuse the decisions from the\ncompositional and the primitive logit space. Orthogonal to the existing\nliterature of soft, hard, or distributional prompts, our method advocates\nprompting the LLM-supported class distribution that leads to a better zero-shot\ngeneralization. Experimental results on MIT-States, UT-Zappos, and C-GQA\ndatasets show the superior performance of the PLID to the prior arts.\n","authors":["Wentao Bao","Lichang Chen","Heng Huang","Yu Kong"],"pdf_url":"https://arxiv.org/pdf/2305.14428v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.12286v3","updated":"2023-09-30T21:01:28Z","published":"2023-05-20T21:38:05Z","title":"Low-Earth Satellite Orbit Determination Using Deep Convolutional\n Networks with Satellite Imagery","summary":" Given the critical roles that satellites play in national defense, public\nsafety, and worldwide communications, finding ways to determine satellite\ntrajectories is a crucially important task for improved space situational\nawareness. However, it is increasingly common for satellites to lose connection\nto the ground stations with which they communicate due to signal interruptions\nfrom the Earth's ionosphere and magnetosphere, among other interferences. In\nthis work, we propose utilizing a computer vision based approach that relies on\nimages of the Earth taken by the satellite in real-time to predict its orbit\nupon losing contact with ground stations. In contrast with other works, we\ntrain neural networks on an image-based dataset and show that the neural\nnetworks outperform the de facto standard in orbit determination (the Kalman\nfilter) in the scenario where the satellite has lost connection with its\nground-based station. Moreover, our approach does not require $\\textit{a\npriori}$ knowledge of the satellite's state and it takes into account the\nexternal factors influencing the satellite's motion using images taken in\nreal-time.\n","authors":["Rohit Khorana"],"pdf_url":"https://arxiv.org/pdf/2305.12286v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13495v3","updated":"2023-09-30T18:58:41Z","published":"2023-05-22T21:25:27Z","title":"Type-to-Track: Retrieve Any Object via Prompt-based Tracking","summary":" One of the recent trends in vision problems is to use natural language\ncaptions to describe the objects of interest. This approach can overcome some\nlimitations of traditional methods that rely on bounding boxes or category\nannotations. This paper introduces a novel paradigm for Multiple Object\nTracking called Type-to-Track, which allows users to track objects in videos by\ntyping natural language descriptions. We present a new dataset for that\nGrounded Multiple Object Tracking task, called GroOT, that contains videos with\nvarious types of objects and their corresponding textual captions describing\ntheir appearance and action in detail. Additionally, we introduce two new\nevaluation protocols and formulate evaluation metrics specifically for this\ntask. We develop a new efficient method that models a transformer-based\neMbed-ENcoDE-extRact framework (MENDER) using the third-order tensor\ndecomposition. 
The experiments in five scenarios show that our MENDER approach\noutperforms another two-stage design in terms of accuracy and efficiency, up to\n14.7% accuracy and 4$\\times$ speed faster.\n","authors":["Pha Nguyen","Kha Gia Quach","Kris Kitani","Khoa Luu"],"pdf_url":"https://arxiv.org/pdf/2305.13495v3.pdf","comment":"Accepted at NeurIPS 2023. Project page:\n https://uark-cviu.github.io/Type-to-Track/"},{"id":"http://arxiv.org/abs/2306.06093v2","updated":"2023-09-30T17:49:45Z","published":"2023-06-09T17:56:07Z","title":"HyP-NeRF: Learning Improved NeRF Priors using a HyperNetwork","summary":" Neural Radiance Fields (NeRF) have become an increasingly popular\nrepresentation to capture high-quality appearance and shape of scenes and\nobjects. However, learning generalizable NeRF priors over categories of scenes\nor objects has been challenging due to the high dimensionality of network\nweight space. To address the limitations of existing work on generalization,\nmulti-view consistency and to improve quality, we propose HyP-NeRF, a latent\nconditioning method for learning generalizable category-level NeRF priors using\nhypernetworks. Rather than using hypernetworks to estimate only the weights of\na NeRF, we estimate both the weights and the multi-resolution hash encodings\nresulting in significant quality gains. To improve quality even further, we\nincorporate a denoise and finetune strategy that denoises images rendered from\nNeRFs estimated by the hypernetwork and finetunes it while retaining multiview\nconsistency. These improvements enable us to use HyP-NeRF as a generalizable\nprior for multiple downstream tasks including NeRF reconstruction from\nsingle-view or cluttered scenes and text-to-NeRF. We provide qualitative\ncomparisons and evaluate HyP-NeRF on three tasks: generalization, compression,\nand retrieval, demonstrating our state-of-the-art results.\n","authors":["Bipasha Sen","Gaurav Singh","Aditya Agarwal","Rohith Agaram","K Madhava Krishna","Srinath Sridhar"],"pdf_url":"https://arxiv.org/pdf/2306.06093v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.13006v2","updated":"2023-09-30T15:29:50Z","published":"2023-03-23T03:02:09Z","title":"Controllable Inversion of Black-Box Face Recognition Models via\n Diffusion","summary":" Face recognition models embed a face image into a low-dimensional identity\nvector containing abstract encodings of identity-specific facial features that\nallow individuals to be distinguished from one another. We tackle the\nchallenging task of inverting the latent space of pre-trained face recognition\nmodels without full model access (i.e. black-box setting). A variety of methods\nhave been proposed in literature for this task, but they have serious\nshortcomings such as a lack of realistic outputs and strong requirements for\nthe data set and accessibility of the face recognition model. By analyzing the\nblack-box inversion problem, we show that the conditional diffusion model loss\nnaturally emerges and that we can effectively sample from the inverse\ndistribution even without an identity-specific loss. Our method, named identity\ndenoising diffusion probabilistic model (ID3PM), leverages the stochastic\nnature of the denoising diffusion process to produce high-quality,\nidentity-preserving face images with various backgrounds, lighting, poses, and\nexpressions. 
We demonstrate state-of-the-art performance in terms of identity\npreservation and diversity both qualitatively and quantitatively, and our\nmethod is the first black-box face recognition model inversion method that\noffers intuitive control over the generation process.\n","authors":["Manuel Kansy","Anton Raël","Graziana Mignone","Jacek Naruniec","Christopher Schroers","Markus Gross","Romann M. Weber"],"pdf_url":"https://arxiv.org/pdf/2303.13006v2.pdf","comment":"8 pages main paper + 23 pages supplementary material. Moderate\n revisions from v1 (different template, added user study, wording). Presented\n at AMFG workshop at ICCV 2023. Project page:\n https://studios.disneyresearch.com/2023/10/02/controllable-inversion-of-black-box-face-recognition-models-via-diffusion/"},{"id":"http://arxiv.org/abs/2306.07879v2","updated":"2023-09-30T15:12:07Z","published":"2023-06-13T16:14:40Z","title":"Rethinking pose estimation in crowds: overcoming the detection\n information-bottleneck and ambiguity","summary":" Frequent interactions between individuals are a fundamental challenge for\npose estimation algorithms. Current pipelines either use an object detector\ntogether with a pose estimator (top-down approach), or localize all body parts\nfirst and then link them to predict the pose of individuals (bottom-up). Yet,\nwhen individuals closely interact, top-down methods are ill-defined due to\noverlapping individuals, and bottom-up methods often falsely infer connections\nto distant bodyparts. Thus, we propose a novel pipeline called bottom-up\nconditioned top-down pose estimation (BUCTD) that combines the strengths of\nbottom-up and top-down methods. Specifically, we propose to use a bottom-up\nmodel as the detector, which in addition to an estimated bounding box provides\na pose proposal that is fed as condition to an attention-based top-down model.\nWe demonstrate the performance and efficiency of our approach on animal and\nhuman pose estimation benchmarks. On CrowdPose and OCHuman, we outperform\nprevious state-of-the-art models by a significant margin. We achieve 78.5 AP on\nCrowdPose and 48.5 AP on OCHuman, an improvement of 8.6% and 7.8% over the\nprior art, respectively. Furthermore, we show that our method strongly improves\nthe performance on multi-animal benchmarks involving fish and monkeys. The code\nis available at https://github.com/amathislab/BUCTD\n","authors":["Mu Zhou","Lucas Stoffl","Mackenzie Weygandt Mathis","Alexander Mathis"],"pdf_url":"https://arxiv.org/pdf/2306.07879v2.pdf","comment":"Published at ICCV 2023; Code at https://github.com/amathislab/BUCTD\n Video at https://www.youtube.com/watch?v=BHZnA-CZeZY"},{"id":"http://arxiv.org/abs/2303.10310v4","updated":"2023-09-30T14:38:44Z","published":"2023-03-18T02:42:18Z","title":"Domain-knowledge Inspired Pseudo Supervision (DIPS) for Unsupervised\n Image-to-Image Translation Models to Support Cross-Domain Classification","summary":" The ability to classify images is dependent on having access to large labeled\ndatasets and testing on data from the same domain that the model can train on.\nClassification becomes more challenging when dealing with new data from a\ndifferent domain, where gathering and especially labeling a larger image\ndataset for retraining a classification model requires a labor-intensive human\neffort. 
Cross-domain classification frameworks were developed to handle this\ndata domain shift problem by utilizing unsupervised image-to-image translation\nmodels to translate an input image from the unlabeled domain to the labeled\ndomain. The problem with these unsupervised models lies in their unsupervised\nnature. For lack of annotations, it is not possible to use the traditional\nsupervised metrics to evaluate these translation models to pick the best-saved\ncheckpoint model. This paper introduces a new method called Domain-knowledge\nInspired Pseudo Supervision (DIPS) which utilizes domain-informed Gaussian\nMixture Models to generate pseudo annotations to enable the use of traditional\nsupervised metrics. This method was designed specifically to support\ncross-domain classification applications contrary to other typically used\nmetrics such as the FID which were designed to evaluate the model in terms of\nthe quality of the generated image from a human-eye perspective. DIPS proves\nits effectiveness by outperforming various GAN evaluation metrics, including\nFID, when selecting the optimal saved checkpoint model. It is also evaluated\nagainst truly supervised metrics. Furthermore, DIPS showcases its robustness\nand interpretability by demonstrating a strong correlation with truly\nsupervised metrics, highlighting its superiority over existing state-of-the-art\nalternatives. The code and data to replicate the results can be found on the\nofficial Github repository: https://github.com/Hindawi91/DIPS\n","authors":["Firas Al-Hindawi","Md Mahfuzur Rahman Siddiquee","Teresa Wu","Han Hu","Ying Sun"],"pdf_url":"https://arxiv.org/pdf/2303.10310v4.pdf","comment":"arXiv admin note: text overlap with arXiv:2212.09107"},{"id":"http://arxiv.org/abs/2304.14065v3","updated":"2023-09-30T13:47:02Z","published":"2023-04-27T09:52:35Z","title":"Lightweight, Pre-trained Transformers for Remote Sensing Timeseries","summary":" Machine learning models for parsing remote sensing data have a wide range of\nsocietally relevant applications, but labels used to train these models can be\ndifficult or impossible to acquire. This challenge has spurred research into\nself-supervised learning for remote sensing data aiming to unlock the use of\nmachine learning in geographies or application domains where labelled datasets\nare small. Current self-supervised learning approaches for remote sensing data\ndraw significant inspiration from techniques applied to natural images.\nHowever, remote sensing data has important differences from natural images --\nfor example, the temporal dimension is critical for many tasks and data is\ncollected from many complementary sensors. We show we can create significantly\nsmaller performant models by designing architectures and self-supervised\ntraining techniques specifically for remote sensing data. We introduce the\nPretrained Remote Sensing Transformer (Presto), a transformer-based model\npre-trained on remote sensing pixel-timeseries data. Presto excels at a wide\nvariety of globally distributed remote sensing tasks and performs competitively\nwith much larger models while requiring far less compute. 
Presto can be used\nfor transfer learning or as a feature extractor for simple models, enabling\nefficient deployment at scale.\n","authors":["Gabriel Tseng","Ruben Cartuyvels","Ivan Zvonkov","Mirali Purohit","David Rolnick","Hannah Kerner"],"pdf_url":"https://arxiv.org/pdf/2304.14065v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.05013v3","updated":"2023-09-30T13:11:29Z","published":"2022-09-12T04:07:34Z","title":"Learning A Locally Unified 3D Point Cloud for View Synthesis","summary":" In this paper, we explore the problem of 3D point cloud representation-based\nview synthesis from a set of sparse source views. To tackle this challenging\nproblem, we propose a new deep learning-based view synthesis paradigm that\nlearns a locally unified 3D point cloud from source views. Specifically, we\nfirst construct sub-point clouds by projecting source views to 3D space based\non their depth maps. Then, we learn the locally unified 3D point cloud by\nadaptively fusing points at a local neighborhood defined on the union of the\nsub-point clouds. Besides, we also propose a 3D geometry-guided image\nrestoration module to fill the holes and recover high-frequency details of the\nrendered novel views. Experimental results on three benchmark datasets\ndemonstrate that our method can improve the average PSNR by more than 4 dB\nwhile preserving more accurate visual details, compared with state-of-the-art\nview synthesis methods.\n","authors":["Meng You","Mantang Guo","Xianqiang Lyu","Hui Liu","Junhui Hou"],"pdf_url":"https://arxiv.org/pdf/2209.05013v3.pdf","comment":"Accepted to TIP"},{"id":"http://arxiv.org/abs/2304.06022v4","updated":"2023-09-30T13:01:52Z","published":"2023-04-12T17:58:03Z","title":"SAM Struggles in Concealed Scenes -- Empirical Study on \"Segment\n Anything\"","summary":" Segmenting anything is a ground-breaking step toward artificial general\nintelligence, and the Segment Anything Model (SAM) greatly fosters the\nfoundation models for computer vision. We could not be more excited to probe\nthe performance traits of SAM. In particular, exploring situations in which SAM\ndoes not perform well is interesting. In this report, we choose three concealed\nscenes, i.e., camouflaged animals, industrial defects, and medical lesions, to\nevaluate SAM under unprompted settings. Our main observation is that SAM looks\nunskilled in concealed scenes.\n","authors":["Ge-Peng Ji","Deng-Ping Fan","Peng Xu","Ming-Ming Cheng","Bowen Zhou","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2304.06022v4.pdf","comment":"Accepted by SCIENCE CHINA Information Sciences, 2023"},{"id":"http://arxiv.org/abs/2305.10924v3","updated":"2023-09-30T12:05:20Z","published":"2023-05-18T12:38:21Z","title":"Structural Pruning for Diffusion Models","summary":" Generative modeling has recently undergone remarkable advancements, primarily\npropelled by the transformative implications of Diffusion Probabilistic Models\n(DPMs). The impressive capability of these models, however, often entails\nsignificant computational overhead during both training and inference. To\ntackle this challenge, we present Diff-Pruning, an efficient compression method\ntailored for learning lightweight diffusion models from pre-existing ones,\nwithout the need for extensive re-training. The essence of Diff-Pruning is\nencapsulated in a Taylor expansion over pruned timesteps, a process that\ndisregards non-contributory diffusion steps and ensembles informative gradients\nto identify important weights. 
Our empirical assessment, undertaken across\nseveral datasets highlights two primary benefits of our proposed method: 1)\nEfficiency: it enables approximately a 50\\% reduction in FLOPs at a mere 10\\%\nto 20\\% of the original training expenditure; 2) Consistency: the pruned\ndiffusion models inherently preserve generative behavior congruent with their\npre-trained models. Code is available at\n\\url{https://github.com/VainF/Diff-Pruning}.\n","authors":["Gongfan Fang","Xinyin Ma","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2305.10924v3.pdf","comment":"Preprint version"},{"id":"http://arxiv.org/abs/2305.19947v2","updated":"2023-09-30T10:40:18Z","published":"2023-05-31T15:33:16Z","title":"A Geometric Perspective on Diffusion Models","summary":" Recent years have witnessed significant progress in developing effective\ntraining and fast sampling techniques for diffusion models. A remarkable\nadvancement is the use of stochastic differential equations (SDEs) and their\nmarginal-preserving ordinary differential equations (ODEs) to describe data\nperturbation and generative modeling in a unified framework. In this paper, we\ncarefully inspect the ODE-based sampling of a popular variance-exploding SDE\nand reveal several intriguing structures of its sampling dynamics. We discover\nthat the data distribution and the noise distribution are smoothly connected\nwith a quasi-linear sampling trajectory and another implicit denoising\ntrajectory that even converges faster. Meanwhile, the denoising trajectory\ngoverns the curvature of the corresponding sampling trajectory and its various\nfinite differences yield all second-order samplers used in practice.\nFurthermore, we establish a theoretical relationship between the optimal\nODE-based sampling and the classic mean-shift (mode-seeking) algorithm, with\nwhich we can characterize the asymptotic behavior of diffusion models and\nidentify the empirical score deviation.\n","authors":["Defang Chen","Zhenyu Zhou","Jian-Ping Mei","Chunhua Shen","Chun Chen","Can Wang"],"pdf_url":"https://arxiv.org/pdf/2305.19947v2.pdf","comment":"38 pages"},{"id":"http://arxiv.org/abs/2307.04081v2","updated":"2023-09-30T09:35:46Z","published":"2023-07-09T01:41:22Z","title":"Score-based Conditional Generation with Fewer Labeled Data by\n Self-calibrating Classifier Guidance","summary":" Score-based generative models (SGMs) are a popular family of deep generative\nmodels that achieve leading image generation quality. Early studies extend SGMs\nto tackle class-conditional generation by coupling an unconditional SGM with\nthe guidance of a trained classifier. Nevertheless, such classifier-guided SGMs\ndo not always achieve accurate conditional generation, especially when trained\nwith fewer labeled data. We argue that the problem is rooted in the\nclassifier's tendency to overfit without coordinating with the underlying\nunconditional distribution. We propose improving classifier-guided SGMs by\nletting the classifier regularize itself to respect the unconditional\ndistribution. Our key idea is to use principles from energy-based models to\nconvert the classifier as another view of the unconditional SGM. Then, existing\nloss for the unconditional SGM can be leveraged to achieve regularization by\ncalibrating the classifier's internal unconditional scores. The regularization\nscheme can be applied to not only the labeled data but also unlabeled ones to\nfurther improve the classifier. 
Empirical results show that the proposed\napproach significantly improves conditional generation quality across various\npercentages of fewer labeled data. The results confirm the potential of the\nproposed approach for generative modeling with limited labeled data.\n","authors":["Paul Kuo-Ming Huang","Si-An Chen","Hsuan-Tien Lin"],"pdf_url":"https://arxiv.org/pdf/2307.04081v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15583v3","updated":"2023-09-30T09:19:37Z","published":"2023-05-24T21:39:27Z","title":"Alleviating Exposure Bias in Diffusion Models through Sampling with\n Shifted Time Steps","summary":" Diffusion Probabilistic Models (DPM) have shown remarkable efficacy in the\nsynthesis of high-quality images. However, their inference process\ncharacteristically requires numerous, potentially hundreds, of iterative steps,\nwhich could exaggerate the problem of exposure bias due to the training and\ninference discrepancy. Previous work has attempted to mitigate this issue by\nperturbing inputs during training, which consequently mandates the retraining\nof the DPM. In this work, we conduct a systematic study of exposure bias in DPM\nand, intriguingly, we find that the exposure bias could be alleviated with a\nnovel sampling method that we propose, without retraining the model. We\nempirically and theoretically show that, during inference, for each backward\ntime step $t$ and corresponding state $\\hat{x}_t$, there might exist another\ntime step $t_s$ which exhibits superior coupling with $\\hat{x}_t$. Based on\nthis finding, we introduce a sampling method named Time-Shift Sampler. Our\nframework can be seamlessly integrated to existing sampling algorithms, such as\nDDPM, DDIM and other high-order solvers, inducing merely minimal additional\ncomputations. Experimental results show our method brings significant and\nconsistent improvements in FID scores on different datasets and sampling\nmethods. For example, integrating Time-Shift Sampler to F-PNDM yields a\nFID=3.88, achieving 44.49\\% improvements as compared to F-PNDM, on CIFAR-10\nwith 10 sampling steps, which is more performant than the vanilla DDIM with 100\nsampling steps. We will release the code upon acceptance.\n","authors":["Mingxiao Li","Tingyu Qu","Ruicong Yao","Wei Sun","Marie-Francine Moens"],"pdf_url":"https://arxiv.org/pdf/2305.15583v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03049v2","updated":"2023-09-30T08:41:42Z","published":"2023-09-06T14:43:58Z","title":"Adaptive Growth: Real-time CNN Layer Expansion","summary":" Deep Neural Networks (DNNs) have shown unparalleled achievements in numerous\napplications, reflecting their proficiency in managing vast data sets. Yet,\ntheir static structure limits their adaptability in ever-changing environments.\nThis research presents a new algorithm that allows the convolutional layer of a\nConvolutional Neural Network (CNN) to dynamically evolve based on data input,\nwhile still being seamlessly integrated into existing DNNs. Instead of a rigid\narchitecture, our approach iteratively introduces kernels to the convolutional\nlayer, gauging its real-time response to varying data. This process is refined\nby evaluating the layer's capacity to discern image features, guiding its\ngrowth. Remarkably, our unsupervised method has outstripped its supervised\ncounterparts across diverse datasets like MNIST, Fashion-MNIST, CIFAR-10, and\nCIFAR-100. It also showcases enhanced adaptability in transfer learning\nscenarios. 
By introducing a data-driven model scalability strategy, we are\nfilling a void in deep learning, leading to more flexible and efficient DNNs\nsuited for dynamic settings.\nCode:(https://github.com/YunjieZhu/Extensible-Convolutional-Layer-git-version).\n","authors":["Yunjie Zhu","Yunhao Chen"],"pdf_url":"https://arxiv.org/pdf/2309.03049v2.pdf","comment":"Code:\n https://github.com/YunjieZhu/Extensible-Convolutional-Layer-git-version"},{"id":"http://arxiv.org/abs/2301.00190v2","updated":"2023-09-30T08:31:19Z","published":"2022-12-31T12:57:09Z","title":"Tracking Passengers and Baggage Items using Multiple Overhead Cameras at\n Security Checkpoints","summary":" We introduce a novel framework to track multiple objects in overhead camera\nvideos for airport checkpoint security scenarios where targets correspond to\npassengers and their baggage items. We propose a Self-Supervised Learning (SSL)\ntechnique to provide the model information about instance segmentation\nuncertainty from overhead images. Our SSL approach improves object detection by\nemploying a test-time data augmentation and a regression-based,\nrotation-invariant pseudo-label refinement technique. Our pseudo-label\ngeneration method provides multiple geometrically-transformed images as inputs\nto a Convolutional Neural Network (CNN), regresses the augmented detections\ngenerated by the network to reduce localization errors, and then clusters them\nusing the mean-shift algorithm. The self-supervised detector model is used in a\nsingle-camera tracking algorithm to generate temporal identifiers for the\ntargets. Our method also incorporates a multi-view trajectory association\nmechanism to maintain consistent temporal identifiers as passengers travel\nacross camera views. An evaluation of detection, tracking, and association\nperformances on videos obtained from multiple overhead cameras in a realistic\nairport checkpoint environment demonstrates the effectiveness of the proposed\napproach. Our results show that self-supervision improves object detection\naccuracy by up to $42\\%$ without increasing the inference time of the model.\nOur multi-camera association method achieves up to $89\\%$ multi-object tracking\naccuracy with an average computation time of less than $15$ ms.\n","authors":["Abubakar Siddique","Henry Medeiros"],"pdf_url":"https://arxiv.org/pdf/2301.00190v2.pdf","comment":"Need to replace already published arxiv version of this work. This\n work will be the latest version of the previously published arXiv:2007.07924"},{"id":"http://arxiv.org/abs/2306.08889v2","updated":"2023-09-30T08:10:26Z","published":"2023-06-15T06:45:46Z","title":"Revealing the Illusion of Joint Multimodal Understanding in VideoQA\n Models","summary":" While VideoQA Transformer models demonstrate competitive performance on\nstandard benchmarks, the reasons behind their success are not fully understood.\nDo these models jointly capture and leverage the rich multimodal structures and\ndynamics from video and text? Or are they merely exploiting shortcuts to\nachieve high scores? Hence, we design $\\textit{QUAG}$ (QUadrant AveraGe), a\nlightweight and non-parametric probe, to critically analyze multimodal\nrepresentations. QUAG facilitates combined dataset-model study by systematic\nablation of model's coupled multimodal understanding during inference.\nSurprisingly, it demonstrates that the models manage to maintain high\nperformance even under multimodal impairment. 
We extend QUAG to design\n\"QUAG-attention\", a simplistic and less-expressive replacement of\nself-attention. We find that the models with QUAG-attention achieve similar\nperformance with significantly less mulops without any finetuning. These\nfindings indicate that the current VideoQA benchmarks and metrics do not\npenalize models that find shortcuts and discount joint multimodal\nunderstanding. Motivated by this, we propose the $\\textit{CLAVI}$\n(Counterfactual in LAnguage and VIdeo), a diagnostic dataset for coupled\nmultimodal understanding in VideoQA. CLAVI consists of temporal questions and\nvideos that are augmented to curate balanced counterfactuals in language and\nvideo domains. We evaluate models on CLAVI and find that all models achieve\nhigh performance on multimodal shortcut instances, but most of them have poor\nperformance on the counterfactual instances that necessitate joint multimodal\nunderstanding. Overall, with the multimodal representation analysis using QUAG\nand diagnostic analysis using CLAVI, we show that many VideoQA models are\nincapable of learning multimodal representations and that their success on\nstandard datasets is an illusion of joint multimodal understanding.\n","authors":["Ishaan Singh Rawal","Shantanu Jaiswal","Basura Fernando","Cheston Tan"],"pdf_url":"https://arxiv.org/pdf/2306.08889v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.08582v4","updated":"2023-09-30T07:01:56Z","published":"2022-01-21T08:02:55Z","title":"SegTransVAE: Hybrid CNN -- Transformer with Regularization for medical\n image segmentation","summary":" Current research on deep learning for medical image segmentation exposes\ntheir limitations in learning either global semantic information or local\ncontextual information. To tackle these issues, a novel network named\nSegTransVAE is proposed in this paper. SegTransVAE is built upon\nencoder-decoder architecture, exploiting transformer with the variational\nautoencoder (VAE) branch to the network to reconstruct the input images jointly\nwith segmentation. To the best of our knowledge, this is the first method\ncombining the success of CNN, transformer, and VAE. Evaluation on various\nrecently introduced datasets shows that SegTransVAE outperforms previous\nmethods in Dice Score and $95\\%$-Haudorff Distance while having comparable\ninference time to a simple CNN-based architecture network. The source code is\navailable at: https://github.com/itruonghai/SegTransVAE.\n","authors":["Quan-Dung Pham","Hai Nguyen-Truong","Nam Nguyen Phuong","Khoa N. A. Nguyen"],"pdf_url":"https://arxiv.org/pdf/2201.08582v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2108.00340v4","updated":"2023-09-30T06:36:57Z","published":"2021-07-31T23:23:35Z","title":"Reconstruction guided Meta-learning for Few Shot Open Set Recognition","summary":" In many applications, we are constrained to learn classifiers from very\nlimited data (few-shot classification). The task becomes even more challenging\nif it is also required to identify samples from unknown categories (open-set\nclassification). Learning a good abstraction for a class with very few samples\nis extremely difficult, especially under open-set settings. 
As a result,\nopen-set recognition has received minimal attention in the few-shot setting.\nHowever, it is a critical task in many applications like environmental\nmonitoring, where the number of labeled examples for each class is limited.\nExisting few-shot open-set recognition (FSOSR) methods rely on thresholding\nschemes, with some considering uniform probability for open-class samples.\nHowever, this approach is often inaccurate, especially for fine-grained\ncategorization, and makes them highly sensitive to the choice of a threshold.\nTo address these concerns, we propose Reconstructing Exemplar-based Few-shot\nOpen-set ClaSsifier (ReFOCS). By using a novel exemplar reconstruction-based\nmeta-learning strategy ReFOCS streamlines FSOSR eliminating the need for a\ncarefully tuned threshold by learning to be self-aware of the openness of a\nsample. The exemplars, act as class representatives and can be either provided\nin the training dataset or estimated in the feature domain. By testing on a\nwide variety of datasets, we show ReFOCS to outperform multiple\nstate-of-the-art methods.\n","authors":["Sayak Nag","Dripta S. Raychaudhuri","Sujoy Paul","Amit K. Roy-Chowdhury"],"pdf_url":"https://arxiv.org/pdf/2108.00340v4.pdf","comment":"Accepted for publication in IEEE Transactions in Pattern Analysis and\n Machine Intelligence (TPAMI)"},{"id":"http://arxiv.org/abs/2307.10173v2","updated":"2023-09-30T06:24:23Z","published":"2023-07-19T17:58:03Z","title":"DNA-Rendering: A Diverse Neural Actor Repository for High-Fidelity\n Human-centric Rendering","summary":" Realistic human-centric rendering plays a key role in both computer vision\nand computer graphics. Rapid progress has been made in the algorithm aspect\nover the years, yet existing human-centric rendering datasets and benchmarks\nare rather impoverished in terms of diversity, which are crucial for rendering\neffect. Researchers are usually constrained to explore and evaluate a small set\nof rendering problems on current datasets, while real-world applications\nrequire methods to be robust across different scenarios. In this work, we\npresent DNA-Rendering, a large-scale, high-fidelity repository of human\nperformance data for neural actor rendering. DNA-Rendering presents several\nalluring attributes. First, our dataset contains over 1500 human subjects, 5000\nmotion sequences, and 67.5M frames' data volume. Second, we provide rich assets\nfor each subject -- 2D/3D human body keypoints, foreground masks, SMPLX models,\ncloth/accessory materials, multi-view images, and videos. These assets boost\nthe current method's accuracy on downstream rendering tasks. Third, we\nconstruct a professional multi-view system to capture data, which contains 60\nsynchronous cameras with max 4096 x 3000 resolution, 15 fps speed, and stern\ncamera calibration steps, ensuring high-quality resources for task training and\nevaluation. Along with the dataset, we provide a large-scale and quantitative\nbenchmark in full-scale, with multiple tasks to evaluate the existing progress\nof novel view synthesis, novel pose animation synthesis, and novel identity\nrendering methods. In this manuscript, we describe our DNA-Rendering effort as\na revealing of new observations, challenges, and future directions to\nhuman-centric rendering. 
The dataset, code, and benchmarks will be publicly\navailable at https://dna-rendering.github.io/\n","authors":["Wei Cheng","Ruixiang Chen","Wanqi Yin","Siming Fan","Keyu Chen","Honglin He","Huiwen Luo","Zhongang Cai","Jingbo Wang","Yang Gao","Zhengming Yu","Zhengyu Lin","Daxuan Ren","Lei Yang","Ziwei Liu","Chen Change Loy","Chen Qian","Wayne Wu","Dahua Lin","Bo Dai","Kwan-Yee Lin"],"pdf_url":"https://arxiv.org/pdf/2307.10173v2.pdf","comment":"This paper is accepted by ICCV2023. Project page:\n https://dna-rendering.github.io/"},{"id":"http://arxiv.org/abs/2306.04344v2","updated":"2023-09-30T05:55:55Z","published":"2023-06-07T11:18:53Z","title":"ViDA: Homeostatic Visual Domain Adapter for Continual Test Time\n Adaptation","summary":" Since real-world machine systems are running in non-stationary environments,\nContinual Test-Time Adaptation (CTTA) task is proposed to adapt the pre-trained\nmodel to continually changing target domains. Recently, existing methods mainly\nfocus on model-based adaptation, which aims to leverage a self-training manner\nto extract the target domain knowledge. However, pseudo labels can be noisy and\nthe updated model parameters are unreliable under dynamic data distributions,\nleading to error accumulation and catastrophic forgetting in the continual\nadaptation process. To tackle these challenges and maintain the model\nplasticity, we tactfully design a Visual Domain Adapter (ViDA) for CTTA,\nexplicitly handling both domain-specific and domain-shared knowledge.\nSpecifically, we first comprehensively explore the different domain\nrepresentations of the adapters with trainable high-rank or low-rank embedding\nspaces. Then we inject ViDAs into the pre-trained model, which leverages\nhigh-rank and low-rank features to adapt the current domain distribution and\nmaintain the continual domain-shared knowledge, respectively. To exploit the\nlow-rank and high-rank ViDAs more effectively, we further propose a Homeostatic\nKnowledge Allotment (HKA) strategy, which adaptively combines different\nknowledge from each ViDA. Extensive experiments conducted on four widely used\nbenchmarks demonstrate that our proposed method achieves state-of-the-art\nperformance in both classification and segmentation CTTA tasks. Note that, our\nmethod can be regarded as a novel transfer paradigm for large-scale models,\ndelivering promising results in adaptation to continually changing\ndistributions.\n","authors":["Jiaming Liu","Senqiao Yang","Peidong Jia","Renrui Zhang","Ming Lu","Yandong Guo","Wei Xue","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.04344v2.pdf","comment":"Neurips2023 final Rating: Weak Accept; Weak Accept; Borderline\n accept; Borderline accept"},{"id":"http://arxiv.org/abs/2211.13579v3","updated":"2023-09-30T04:41:21Z","published":"2022-11-24T13:08:43Z","title":"Knowledge-Aware Federated Active Learning with Non-IID Data","summary":" Federated learning enables multiple decentralized clients to learn\ncollaboratively without sharing the local training data. However, the expensive\nannotation cost to acquire data labels on local clients remains an obstacle in\nutilizing local data. In this paper, we propose a federated active learning\nparadigm to efficiently learn a global model with limited annotation budget\nwhile protecting data privacy in a decentralized learning way. The main\nchallenge faced by federated active learning is the mismatch between the active\nsampling goal of the global model on the server and that of the asynchronous\nlocal clients. 
This becomes even more significant when data is distributed\nnon-IID across local clients. To address the aforementioned challenge, we\npropose Knowledge-Aware Federated Active Learning (KAFAL), which consists of\nKnowledge-Specialized Active Sampling (KSAS) and Knowledge-Compensatory\nFederated Update (KCFU). KSAS is a novel active sampling method tailored for\nthe federated active learning problem. It deals with the mismatch challenge by\nsampling actively based on the discrepancies between local and global models.\nKSAS intensifies specialized knowledge in local clients, ensuring the sampled\ndata to be informative for both the local clients and the global model. KCFU,\nin the meantime, deals with the client heterogeneity caused by limited data and\nnon-IID data distributions. It compensates for each client's ability in weak\nclasses by the assistance of the global model. Extensive experiments and\nanalyses are conducted to show the superiority of KSAS over the\nstate-of-the-art active learning methods and the efficiency of KCFU under the\nfederated active learning framework.\n","authors":["Yu-Tong Cao","Ye Shi","Baosheng Yu","Jingya Wang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2211.13579v3.pdf","comment":"14 pages, 12 figures, ICCV23"},{"id":"http://arxiv.org/abs/2308.04152v3","updated":"2023-09-30T04:13:17Z","published":"2023-08-08T09:32:43Z","title":"Fine-tuning Multimodal LLMs to Follow Zero-shot Demonstrative\n Instructions","summary":" Recent advancements in Multimodal Large Language Models (MLLMs) have been\nutilizing Visual Prompt Generators (VPGs) to convert visual features into\ntokens that LLMs can recognize. This is achieved by training the VPGs on\nmillions of image-caption pairs, where the VPG-generated tokens of images are\nfed into a frozen LLM to generate the corresponding captions. However, this\nimage-captioning based training objective inherently biases the VPG to\nconcentrate solely on the primary visual contents sufficient for caption\ngeneration, often neglecting other visual details. This shortcoming results in\nMLLMs' underperformance in comprehending demonstrative instructions consisting\nof multiple, interleaved, and multimodal instructions that demonstrate the\nrequired context to complete a task. To address this issue, we introduce a\ngeneric and lightweight Visual Prompt Generator Complete module (VPG-C), which\ncan infer and complete the missing details essential for comprehending\ndemonstrative instructions. Further, we propose a synthetic discriminative\ntraining strategy to fine-tune VPG-C, eliminating the need for supervised\ndemonstrative instructions. As for evaluation, we build DEMON, a comprehensive\nbenchmark for demonstrative instruction understanding. Synthetically trained\nwith the proposed strategy, VPG-C achieves significantly stronger zero-shot\nperformance across all tasks of DEMON. Further evaluation on the MME and\nOwlEval benchmarks also demonstrate the superiority of VPG-C. 
Our benchmark,\ncode, and pre-trained models are available at\nhttps://github.com/DCDmllm/Cheetah.\n","authors":["Juncheng Li","Kaihang Pan","Zhiqi Ge","Minghe Gao","Hanwang Zhang","Wei Ji","Wenqiao Zhang","Tat-Seng Chua","Siliang Tang","Yueting Zhuang"],"pdf_url":"https://arxiv.org/pdf/2308.04152v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.11521v3","updated":"2023-09-30T02:41:27Z","published":"2022-05-23T17:59:58Z","title":"From Hours to Seconds: Towards 100x Faster Quantitative Phase Imaging\n via Differentiable Microscopy","summary":" With applications ranging from metabolomics to histopathology, quantitative\nphase microscopy (QPM) is a powerful label-free imaging modality. Despite\nsignificant advances in fast multiplexed imaging sensors and\ndeep-learning-based inverse solvers, the throughput of QPM is currently limited\nby the speed of electronic hardware. Complementarily, to improve throughput\nfurther, here we propose to acquire images in a compressed form such that more\ninformation can be transferred beyond the existing electronic hardware\nbottleneck. To this end, we present a learnable optical\ncompression-decompression framework that learns content-specific features. The\nproposed differentiable quantitative phase microscopy ($\\partial \\mu$) first\nuses learnable optical feature extractors as image compressors. The intensity\nrepresentation produced by these networks is then captured by the imaging\nsensor. Finally, a reconstruction network running on electronic hardware\ndecompresses the QPM images. In numerical experiments, the proposed system\nachieves compression of $\\times$ 64 while maintaining the SSIM of $\\sim 0.90$\nand PSNR of $\\sim 30$ dB on cells. The results demonstrated by our experiments\nopen up a new pathway for achieving end-to-end optimized (i.e., optics and\nelectronic) compact QPM systems that may provide unprecedented throughput\nimprovements.\n","authors":["Udith Haputhanthri","Kithmini Herath","Ramith Hettiarachchi","Hasindu Kariyawasam","Azeem Ahmad","Balpreet S. Ahluwalia","Chamira U. S. Edussooriya","Dushan N. Wadduwage"],"pdf_url":"https://arxiv.org/pdf/2205.11521v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09001v2","updated":"2023-09-30T01:50:38Z","published":"2023-06-15T09:56:33Z","title":"SSCBench: Monocular 3D Semantic Scene Completion Benchmark in Street\n Views","summary":" Monocular scene understanding is a foundational component of autonomous\nsystems. Within the spectrum of monocular perception topics, one crucial and\nuseful task for holistic 3D scene understanding is semantic scene completion\n(SSC), which jointly completes semantic information and geometric details from\nRGB input. However, progress in SSC, particularly in large-scale street views,\nis hindered by the scarcity of high-quality datasets. To address this issue, we\nintroduce SSCBench, a comprehensive benchmark that integrates scenes from\nwidely used automotive datasets (e.g., KITTI-360, nuScenes, and Waymo).\nSSCBench follows an established setup and format in the community, facilitating\nthe easy exploration of SSC methods in various street views. We benchmark\nmodels using monocular, trinocular, and point cloud input to assess the\nperformance gap resulting from sensor coverage and modality. Moreover, we have\nunified semantic labels across diverse datasets to simplify cross-domain\ngeneralization testing. 
We commit to including more datasets and SSC models to\ndrive further advancements in this field.\n","authors":["Yiming Li","Sihang Li","Xinhao Liu","Moonjun Gong","Kenan Li","Nuo Chen","Zijun Wang","Zhiheng Li","Tao Jiang","Fisher Yu","Yue Wang","Hang Zhao","Zhiding Yu","Chen Feng"],"pdf_url":"https://arxiv.org/pdf/2306.09001v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.04546v5","updated":"2023-09-30T01:10:44Z","published":"2023-04-10T12:37:26Z","title":"Kinship Representation Learning with Face Componential Relation","summary":" Kinship recognition aims to determine whether the subjects in two facial\nimages are kin or non-kin, which is an emerging and challenging problem.\nHowever, most previous methods focus on heuristic designs without considering\nthe spatial correlation between face images. In this paper, we aim to learn\ndiscriminative kinship representations embedded with the relation information\nbetween face components (e.g., eyes, nose, etc.). To achieve this goal, we\npropose the Face Componential Relation Network, which learns the relationship\nbetween face components among images with a cross-attention mechanism, which\nautomatically learns the important facial regions for kinship recognition.\nMoreover, we propose Face Componential Relation Network (FaCoRNet), which\nadapts the loss function by the guidance from cross-attention to learn more\ndiscriminative feature representations. The proposed FaCoRNet outperforms\nprevious state-of-the-art methods by large margins for the largest public\nkinship recognition FIW benchmark.\n","authors":["Weng-Tai Su","Min-Hung Chen","Chien-Yi Wang","Shang-Hong Lai","Trista Pei-Chun Chen"],"pdf_url":"https://arxiv.org/pdf/2304.04546v5.pdf","comment":"ICCV 2023 Workshop (Analysis and Modeling of Faces and Gestures)"},{"id":"http://arxiv.org/abs/2306.11180v3","updated":"2023-09-30T00:45:58Z","published":"2023-06-19T22:07:20Z","title":"Hyperbolic Active Learning for Semantic Segmentation under Domain Shift","summary":" We introduce a hyperbolic neural network approach to pixel-level active\nlearning for semantic segmentation, and propose a novel geometric\ninterpretation of the hyperbolic geometry that arises bottom-up from the\nstatistics of the data. In our formulation the hyperbolic radius emerges as an\nestimator of the unexplained class complexity, which encompasses the class\nintrinsic complexity and its scarcity in the dataset. The unexplained class\ncomplexity serves as a metric indicating the likelihood that acquiring a\nparticular pixel would contribute to enhancing the data information. We combine\nthis quantity with prediction uncertainty to compute an acquisition score that\nidentifies the most informative pixels for oracle annotation. Our proposed HALO\n(Hyperbolic Active Learning Optimization) sets a new state-of-the-art in active\nlearning for semantic segmentation under domain shift, and surpasses the\nsupervised domain adaptation performance while only using a small portion of\nlabels (i.e., 1%). We perform extensive experimental analysis based on two\nestablished benchmarks, i.e. 
GTAV $\\rightarrow$ Cityscapes and SYNTHIA\n$\\rightarrow$ Cityscapes, and we additionally test on Cityscape $\\rightarrow$\nACDC under adverse weather conditions.\n","authors":["Luca Franco","Paolo Mandica","Konstantinos Kallidromitis","Devin Guillory","Yu-Teng Li","Trevor Darrell","Fabio Galasso"],"pdf_url":"https://arxiv.org/pdf/2306.11180v3.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2309.09277v2","updated":"2023-09-30T08:49:02Z","published":"2023-09-17T13:51:25Z","title":"Fairness for All: Investigating Harms to Within-Group Individuals in\n Producer Fairness Re-ranking Optimization -- A Reproducibility Study","summary":" Recommender systems are widely used to provide personalized recommendations\nto users. Recent research has shown that recommender systems may be subject to\ndifferent types of biases, such as popularity bias, leading to an uneven\ndistribution of recommendation exposure among producer groups. To mitigate\nthis, producer-centered fairness re-ranking (PFR) approaches have been proposed\nto ensure equitable recommendation utility across groups. However, these\napproaches overlook the harm they may cause to within-group individuals\nassociated with colder items, which are items with few or no interactions.\n This study reproduces previous PFR approaches and shows that they\nsignificantly harm colder items, leading to a fairness gap for these items in\nboth advantaged and disadvantaged groups. Surprisingly, the unfair base\nrecommendation models were providing greater exposure opportunities to these\nindividual cold items, even though at the group level, they appeared to be\nunfair. To address this issue, the study proposes an amendment to the PFR\napproach that regulates the number of colder items recommended by the system.\nThis modification achieves a balance between accuracy and producer fairness\nwhile optimizing the selection of colder items within each group, thereby\npreventing or reducing harm to within-group individuals and augmenting the\nnovelty of all recommended items. The proposed method is able to register an\nincrease in sub-group fairness (SGF) from 0.3104 to 0.3782, 0.6156, and 0.9442\nwhile also improving group-level fairness (GF) (112% and 37% with respect to\nbase models and traditional PFR). Moreover, the proposed method achieves these\nimprovements with minimal or no reduction in accuracy (or even an increase\nsometimes). We evaluate the proposed method on various recommendation datasets\nand demonstrate promising results independent of the underlying model or\ndatasets.\n","authors":["Giovanni Pellegrini","Vittorio Maria Faraco","Yashar Deldjoo"],"pdf_url":"https://arxiv.org/pdf/2309.09277v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13050v3","updated":"2023-09-30T07:30:10Z","published":"2023-06-22T17:17:45Z","title":"Data augmentation and refinement for recommender system: A\n semi-supervised approach using maximum margin matrix factorization","summary":" Collaborative filtering (CF) has become a popular method for developing\nrecommender systems (RSs) where ratings of a user for new items are predicted\nbased on her past preferences and available preference information of other\nusers. Despite the popularity of CF-based methods, their performance is often\ngreatly limited by the sparsity of observed entries. 
In this study, we explore\nthe data augmentation and refinement aspects of Maximum Margin Matrix\nFactorization (MMMF), a widely accepted CF technique for rating predictions,\nwhich has not been investigated before. We exploit the inherent characteristics\nof CF algorithms to assess the confidence level of individual ratings and\npropose a semi-supervised approach for rating augmentation based on\nself-training. We hypothesize that any CF algorithm's predictions with low\nconfidence are due to some deficiency in the training data and hence, the\nperformance of the algorithm can be improved by adopting a systematic data\naugmentation strategy. We iteratively use some of the ratings predicted with\nhigh confidence to augment the training data and remove low-confidence entries\nthrough a refinement process. By repeating this process, the system learns to\nimprove prediction accuracy. Our method is experimentally evaluated on several\nstate-of-the-art CF algorithms and leads to informative rating augmentation,\nimproving the performance of the baseline approaches.\n","authors":["Shamal Shaikh","Venkateswara Rao Kagita","Vikas Kumar","Arun K Pujari"],"pdf_url":"https://arxiv.org/pdf/2306.13050v3.pdf","comment":"21 pages"},{"id":"http://arxiv.org/abs/2310.00410v1","updated":"2023-09-30T15:14:50Z","published":"2023-09-30T15:14:50Z","title":"Open-Domain Dialogue Quality Evaluation: Deriving Nugget-level Scores\n from Turn-level Scores","summary":" Existing dialogue quality evaluation systems can return a score for a given\nsystem turn from a particular viewpoint, e.g., engagingness. However, to\nimprove dialogue systems by locating exactly where in a system turn potential\nproblems lie, a more fine-grained evaluation may be necessary. We therefore\npropose an evaluation approach where a turn is decomposed into nuggets (i.e.,\nexpressions associated with a dialogue act), and nugget-level evaluation is\nenabled by leveraging an existing turn-level evaluation system. We demonstrate\nthe potential effectiveness of our evaluation method through a case study.\n","authors":["Rikiya Takehi","Akihisa Watanabe","Tetsuya Sakai"],"pdf_url":"https://arxiv.org/pdf/2310.00410v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00402v1","updated":"2023-09-30T14:55:44Z","published":"2023-09-30T14:55:44Z","title":"DiskANN++: Efficient Page-based Search over Isomorphic Mapped Graph\n Index using Query-sensitivity Entry Vertex","summary":" Given a vector dataset $\\mathcal{X}$ and a query vector $\\vec{x}_q$,\ngraph-based Approximate Nearest Neighbor Search (ANNS) aims to build a graph\nindex $G$ and approximately return vectors with minimum distances to\n$\\vec{x}_q$ by searching over $G$. The main drawback of graph-based ANNS is\nthat a graph index would be too large to fit into the memory especially for a\nlarge-scale $\\mathcal{X}$. To solve this, a Product Quantization (PQ)-based\nhybrid method called DiskANN is proposed to store a low-dimensional PQ index in\nmemory and retain a graph index in SSD, thus reducing memory overhead while\nensuring a high search accuracy. However, it suffers from two I/O issues that\nsignificantly affect the overall efficiency: (1) long routing path from an\nentry vertex to the query's neighborhood that results in large number of I/O\nrequests and (2) redundant I/O requests during the routing process. We propose\nan optimized DiskANN++ to overcome above issues. 
Specifically, for the first\nissue, we present a query-sensitive entry vertex selection strategy to replace\nDiskANN's static graph-central entry vertex by a dynamically determined entry\nvertex that is close to the query. For the second I/O issue, we present an\nisomorphic mapping on DiskANN's graph index to optimize the SSD layout and\npropose an asynchronously optimized Pagesearch based on the optimized SSD\nlayout as an alternative to DiskANN's beamsearch. Comprehensive experimental\nstudies on eight real-world datasets demonstrate our DiskANN++'s superiority on\nefficiency. We achieve a notable 1.5 X to 2.2 X improvement on QPS compared to\nDiskANN, given the same accuracy constraint.\n","authors":["Jiongkang Ni","Xiaoliang Xu","Yuxiang Wang","Can Li","Jiajie Yao","Shihai Xiao","Xuecang Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.00402v1.pdf","comment":"14 pages including references, 9 figures"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2309.14331v2","updated":"2023-09-30T23:55:41Z","published":"2023-09-25T17:56:54Z","title":"LinGCN: Structural Linearized Graph Convolutional Network for\n Homomorphically Encrypted Inference","summary":" The growth of Graph Convolution Network (GCN) model sizes has revolutionized\nnumerous applications, surpassing human performance in areas such as personal\nhealthcare and financial systems. The deployment of GCNs in the cloud raises\nprivacy concerns due to potential adversarial attacks on client data. To\naddress security concerns, Privacy-Preserving Machine Learning (PPML) using\nHomomorphic Encryption (HE) secures sensitive client data. However, it\nintroduces substantial computational overhead in practical applications. To\ntackle those challenges, we present LinGCN, a framework designed to reduce\nmultiplication depth and optimize the performance of HE based GCN inference.\nLinGCN is structured around three key elements: (1) A differentiable structural\nlinearization algorithm, complemented by a parameterized discrete indicator\nfunction, co-trained with model weights to meet the optimization goal. This\nstrategy promotes fine-grained node-level non-linear location selection,\nresulting in a model with minimized multiplication depth. (2) A compact\nnode-wise polynomial replacement policy with a second-order trainable\nactivation function, steered towards superior convergence by a two-level\ndistillation approach from an all-ReLU based teacher model. (3) an enhanced HE\nsolution that enables finer-grained operator fusion for node-wise activation\nfunctions, further reducing multiplication level consumption in HE-based\ninference. Our experiments on the NTU-XVIEW skeleton joint dataset reveal that\nLinGCN excels in latency, accuracy, and scalability for homomorphically\nencrypted inference, outperforming solutions such as CryptoGCN. Remarkably,\nLinGCN achieves a 14.2x latency speedup relative to CryptoGCN, while preserving\nan inference accuracy of 75% and notably reducing multiplication depth.\n","authors":["Hongwu Peng","Ran Ran","Yukui Luo","Jiahui Zhao","Shaoyi Huang","Kiran Thorat","Tong Geng","Chenghong Wang","Xiaolin Xu","Wujie Wen","Caiwen Ding"],"pdf_url":"https://arxiv.org/pdf/2309.14331v2.pdf","comment":"NeurIPS 2023 accepted publication"},{"id":"http://arxiv.org/abs/2305.10616v3","updated":"2023-09-30T22:44:48Z","published":"2023-05-18T00:04:38Z","title":"Evaluation Metrics for DNNs Compression","summary":" There is a lot of ongoing research effort into developing different\ntechniques for neural networks compression. 
However, the community lacks\nstandardised evaluation metrics, which are key to identifying the most suitable\ncompression technique for different applications. This paper reviews existing\nneural network compression evaluation metrics and implements them into a\nstandardisation framework called NetZIP. We introduce two novel metrics to\ncover existing gaps of evaluation in the literature: 1) Compression and\nHardware Agnostic Theoretical Speed (CHATS) and 2) Overall Compression Success\n(OCS). We demonstrate the use of NetZIP using two case studies on two different\nhardware platforms (a PC and a Raspberry Pi 4) focusing on object\nclassification and object detection.\n","authors":["Abanoub Ghobrial","Samuel Budgett","Dieter Balemans","Hamid Asgari","Phil Reiter","Kerstin Eder"],"pdf_url":"https://arxiv.org/pdf/2305.10616v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.14937v2","updated":"2023-09-30T21:07:16Z","published":"2022-09-29T16:54:53Z","title":"NAG-GS: Semi-Implicit, Accelerated and Robust Stochastic Optimizer","summary":" Classical machine learning models such as deep neural networks are usually\ntrained by using Stochastic Gradient Descent-based (SGD) algorithms. The\nclassical SGD can be interpreted as a discretization of the stochastic gradient\nflow. In this paper we propose a novel, robust and accelerated stochastic\noptimizer that relies on two key elements: (1) an accelerated Nesterov-like\nStochastic Differential Equation (SDE) and (2) its semi-implicit Gauss-Seidel\ntype discretization. The convergence and stability of the obtained method,\nreferred to as NAG-GS, are first studied extensively in the case of the\nminimization of a quadratic function. This analysis allows us to come up with\nan optimal learning rate in terms of the convergence rate while ensuring the\nstability of NAG-GS. This is achieved by the careful analysis of the spectral\nradius of the iteration matrix and the covariance matrix at stationarity with\nrespect to all hyperparameters of our method. Further, we show that NAG- GS is\ncompetitive with state-of-the-art methods such as momentum SGD with weight\ndecay and AdamW for the training of machine learning models such as the\nlogistic regression model, the residual networks models on standard computer\nvision datasets, Transformers in the frame of the GLUE benchmark and the recent\nVision Transformers.\n","authors":["Valentin Leplat","Daniil Merkulov","Aleksandr Katrutsa","Daniel Bershatsky","Olga Tsymboi","Ivan Oseledets"],"pdf_url":"https://arxiv.org/pdf/2209.14937v2.pdf","comment":"We study Nesterov acceleration for the Stochastic Differential\n Equation"},{"id":"http://arxiv.org/abs/2202.05135v5","updated":"2023-09-30T20:32:37Z","published":"2022-02-10T16:40:59Z","title":"Group-Agent Reinforcement Learning","summary":" It can largely benefit the reinforcement learning (RL) process of each agent\nif multiple geographically distributed agents perform their separate RL tasks\ncooperatively. Different from multi-agent reinforcement learning (MARL) where\nmultiple agents are in a common environment and should learn to cooperate or\ncompete with each other, in this case each agent has its separate environment\nand only communicates with others to share knowledge without any cooperative or\ncompetitive behaviour as a learning outcome. In fact, this scenario exists\nwidely in real life whose concept can be utilised in many applications, but is\nnot well understood yet and not well formulated. 
As the first effort, we\npropose group-agent system for RL as a formulation of this scenario and the\nthird type of RL system with respect to single-agent and multi-agent systems.\nWe then propose a distributed RL framework called DDAL (Decentralised\nDistributed Asynchronous Learning) designed for group-agent reinforcement\nlearning (GARL). We show through experiments that DDAL achieved desirable\nperformance with very stable training and has good scalability.\n","authors":["Kaiyue Wu","Xiao-Jun Zeng"],"pdf_url":"https://arxiv.org/pdf/2202.05135v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.00952v2","updated":"2023-09-30T19:35:49Z","published":"2023-06-01T17:49:58Z","title":"Meta-Learning Framework for End-to-End Imposter Identification in Unseen\n Speaker Recognition","summary":" Speaker identification systems are deployed in diverse environments, often\ndifferent from the lab conditions on which they are trained and tested. In this\npaper, first, we show the problem of generalization using fixed thresholds\n(computed using EER metric) for imposter identification in unseen speaker\nrecognition and then introduce a robust speaker-specific thresholding technique\nfor better performance. Secondly, inspired by the recent use of meta-learning\ntechniques in speaker verification, we propose an end-to-end meta-learning\nframework for imposter detection which decouples the problem of imposter\ndetection from unseen speaker identification. Thus, unlike most prior works\nthat use some heuristics to detect imposters, the proposed network learns to\ndetect imposters by leveraging the utterances of the enrolled speakers.\nFurthermore, we show the efficacy of the proposed techniques on VoxCeleb1, VCTK\nand the FFSVC 2022 datasets, beating the baselines by up to 10%.\n","authors":["Ashutosh Chaubey","Sparsh Sinha","Susmita Ghose"],"pdf_url":"https://arxiv.org/pdf/2306.00952v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03985v2","updated":"2023-09-30T19:07:04Z","published":"2023-08-08T02:03:47Z","title":"Fourier neural operator for real-time simulation of 3D dynamic urban\n microclimate","summary":" Global urbanization has underscored the significance of urban microclimates\nfor human comfort, health, and building/urban energy efficiency. They\nprofoundly influence building design and urban planning as major environmental\nimpacts. Understanding local microclimates is essential for cities to prepare\nfor climate change and effectively implement resilience measures. However,\nanalyzing urban microclimates requires considering a complex array of outdoor\nparameters within computational domains at the city scale over a longer period\nthan indoors. As a result, numerical methods like Computational Fluid Dynamics\n(CFD) become computationally expensive when evaluating the impact of urban\nmicroclimates. The rise of deep learning techniques has opened new\nopportunities for accelerating the modeling of complex non-linear interactions\nand system dynamics. Recently, the Fourier Neural Operator (FNO) has been shown\nto be very promising in accelerating solving the Partial Differential Equations\n(PDEs) and modeling fluid dynamic systems. In this work, we apply the FNO\nnetwork for real-time three-dimensional (3D) urban wind field simulation. 
The\ntraining and testing data are generated from CFD simulation of the urban area,\nbased on the semi-Lagrangian approach and fractional stepping method to\nsimulate urban microclimate features for modeling large-scale urban problems.\nNumerical experiments show that the FNO model can accurately reconstruct the\ninstantaneous spatial velocity field. We further evaluate the trained FNO model\non unseen data with different wind directions, and the results show that the\nFNO model can generalize well on different wind directions. More importantly,\nthe FNO approach can make predictions within milliseconds on the graphics\nprocessing unit, making real-time simulation of 3D dynamic urban microclimate\npossible.\n","authors":["Wenhui Peng","Shaoxiang Qin","Senwen Yang","Jianchun Wang","Xue Liu","Liangzhu Leon Wang"],"pdf_url":"https://arxiv.org/pdf/2308.03985v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.01302v3","updated":"2023-09-30T18:36:42Z","published":"2022-03-02T18:40:00Z","title":"Evolving Curricula with Regret-Based Environment Design","summary":" It remains a significant challenge to train generally capable agents with\nreinforcement learning (RL). A promising avenue for improving the robustness of\nRL agents is through the use of curricula. One such class of methods frames\nenvironment design as a game between a student and a teacher, using\nregret-based objectives to produce environment instantiations (or levels) at\nthe frontier of the student agent's capabilities. These methods benefit from\ntheir generality, with theoretical guarantees at equilibrium, yet they often\nstruggle to find effective levels in challenging design spaces. By contrast,\nevolutionary approaches seek to incrementally alter environment complexity,\nresulting in potentially open-ended learning, but often rely on domain-specific\nheuristics and vast amounts of computational resources. In this paper we\npropose to harness the power of evolution in a principled, regret-based\ncurriculum. Our approach, which we call Adversarially Compounding Complexity by\nEditing Levels (ACCEL), seeks to constantly produce levels at the frontier of\nan agent's capabilities, resulting in curricula that start simple but become\nincreasingly complex. ACCEL maintains the theoretical benefits of prior\nregret-based methods, while providing significant empirical gains in a diverse\nset of environments. An interactive version of the paper is available at\naccelagent.github.io.\n","authors":["Jack Parker-Holder","Minqi Jiang","Michael Dennis","Mikayel Samvelyan","Jakob Foerster","Edward Grefenstette","Tim Rocktäschel"],"pdf_url":"https://arxiv.org/pdf/2203.01302v3.pdf","comment":"First two authors contributed equally"},{"id":"http://arxiv.org/abs/2209.13482v2","updated":"2023-09-30T18:25:51Z","published":"2022-09-27T15:55:23Z","title":"Predicting Swarm Equatorial Plasma Bubbles via Machine Learning and\n Shapley Values","summary":" In this study we present AI Prediction of Equatorial Plasma Bubbles (APE), a\nmachine learning model that can accurately predict the Ionospheric Bubble Index\n(IBI) on the Swarm spacecraft. IBI is a correlation ($R^2$) between\nperturbations in plasma density and the magnetic field, whose source can be\nEquatorial Plasma Bubbles (EPBs). EPBs have been studied for a number of years,\nbut their day-to-day variability has made predicting them a considerable\nchallenge. We build an ensemble machine learning model to predict IBI. 
We use\ndata from 2014-22 at a resolution of 1sec, and transform it from a time-series\ninto a 6-dimensional space with a corresponding EPB $R^2$ (0-1) acting as the\nlabel. APE performs well across all metrics, exhibiting a skill, association\nand root mean squared error score of 0.96, 0.98 and 0.08 respectively. The\nmodel performs best post-sunset, in the American/Atlantic sector, around the\nequinoxes, and when solar activity is high. This is promising because EPBs are\nmost likely to occur during these periods. Shapley values reveal that F10.7 is\nthe most important feature in driving the predictions, whereas latitude is the\nleast. The analysis also examines the relationship between the features, which\nreveals new insights into EPB climatology. Finally, the selection of the\nfeatures means that APE could be expanded to forecasting EPBs following\nadditional investigations into their onset.\n","authors":["S. A. Reddy","C. Forsyth","A. Aruliah","A. Smith","J. Bortnik","E. Aa","D. O. Kataria","G. Lewis"],"pdf_url":"https://arxiv.org/pdf/2209.13482v2.pdf","comment":"13 Pages, 9 Figures"},{"id":"http://arxiv.org/abs/2304.07665v2","updated":"2023-09-30T18:19:04Z","published":"2023-04-16T01:40:48Z","title":"Dynamic Exploration-Exploitation Trade-Off in Active Learning Regression\n with Bayesian Hierarchical Modeling","summary":" Active learning provides a framework to adaptively query the most informative\nexperiments towards learning an unknown black-box function. Various approaches\nof active learning have been proposed in the literature, however, they either\nfocus on exploration or exploitation in the design space. Methods that do\nconsider exploration-exploitation simultaneously employ fixed or ad-hoc\nmeasures to control the trade-off that may not be optimal. In this paper, we\ndevelop a Bayesian hierarchical approach, referred as BHEEM, to dynamically\nbalance the exploration-exploitation trade-off as more data points are queried.\nTo sample from the posterior distribution of the trade-off parameter, We\nsubsequently formulate an approximate Bayesian computation approach based on\nthe linear dependence of queried data in the feature space. Simulated and\nreal-world examples show the proposed approach achieves at least 21% and 11%\naverage improvement when compared to pure exploration and exploitation\nstrategies respectively. More importantly, we note that by optimally balancing\nthe trade-off between exploration and exploitation, BHEEM performs better or at\nleast as well as either pure exploration or pure exploitation.\n","authors":["Upala Junaida Islam","Kamran Paynabar","George Runger","Ashif Sikandar Iquebal"],"pdf_url":"https://arxiv.org/pdf/2304.07665v2.pdf","comment":"30 pages, 10 figures, 0 table, submitted to IISE Transaction"},{"id":"http://arxiv.org/abs/2303.12981v3","updated":"2023-09-30T17:58:43Z","published":"2023-03-23T01:14:36Z","title":"Connected Superlevel Set in (Deep) Reinforcement Learning and its\n Application to Minimax Theorems","summary":" The aim of this paper is to improve the understanding of the optimization\nlandscape for policy optimization problems in reinforcement learning.\nSpecifically, we show that the superlevel set of the objective function with\nrespect to the policy parameter is always a connected set both in the tabular\nsetting and under policies represented by a class of neural networks. 
In\naddition, we show that the optimization objective as a function of the policy\nparameter and reward satisfies a stronger \"equiconnectedness\" property. To our\nbest knowledge, these are novel and previously unknown discoveries.\n We present an application of the connectedness of these superlevel sets to\nthe derivation of minimax theorems for robust reinforcement learning. We show\nthat any minimax optimization program which is convex on one side and is\nequiconnected on the other side observes the minimax equality (i.e. has a Nash\nequilibrium). We find that this exact structure is exhibited by an interesting\nrobust reinforcement learning problem under an adversarial reward attack, and\nthe validity of its minimax equality immediately follows. This is the first\ntime such a result is established in the literature.\n","authors":["Sihan Zeng","Thinh T. Doan","Justin Romberg"],"pdf_url":"https://arxiv.org/pdf/2303.12981v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12206v2","updated":"2023-09-30T17:38:49Z","published":"2023-03-21T21:42:03Z","title":"Policy Optimization for Personalized Interventions in Behavioral Health","summary":" Behavioral health interventions, delivered through digital platforms, have\nthe potential to significantly improve health outcomes, through education,\nmotivation, reminders, and outreach. We study the problem of optimizing\npersonalized interventions for patients to maximize a long-term outcome, where\ninterventions are costly and capacity-constrained. We assume there exists a\ndataset collected from an initial pilot study that we can leverage. We present\na new approach for this problem that we dub DecompPI, which approximates one\nstep of policy iteration. Implementing DecompPI simply consists of a prediction\ntask using the dataset, alleviating the need for online experimentation.\nDecompPI is a generic model-free algorithm that can be used irrespective of the\nunderlying patient behavior model. We derive theoretical guarantees on a\nsimple, special case of the model that is representative of our problem\nsetting. We establish an approximation ratio for DecompPI with respect to the\nimprovement beyond a null policy that does not allocate interventions.\nSpecifically, when the initial policy used to collect the data is randomized,\nthe approximation ratio of the improvement approaches 1/2 as the intervention\ncapacity of the initial policy decreases. We show that this guarantee is robust\nto estimation errors. We conduct a rigorous empirical case study using\nreal-world data from a mobile health platform for improving treatment adherence\nfor tuberculosis. Using a validated simulation model, we demonstrate that\nDecompPI can provide the same efficacy as the status quo approach with\napproximately half the capacity of interventions. DecompPI is simple and easy\nto implement for organizations aiming to improve long-term behavior through\ntargeted interventions, and this paper demonstrates its strong performance both\ntheoretically and empirically.\n","authors":["Jackie Baek","Justin J. Boutilier","Vivek F. 
Farias","Jonas Oddur Jonasson","Erez Yoeli"],"pdf_url":"https://arxiv.org/pdf/2303.12206v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2106.02626v3","updated":"2023-09-30T17:09:57Z","published":"2021-06-04T17:39:36Z","title":"Dynamics of specialization in neural modules under resource constraints","summary":" It has long been believed that the brain is highly modular both in terms of\nstructure and function, although recent evidence has led some to question the\nextent of both types of modularity. We used artificial neural networks to test\nthe hypothesis that structural modularity is sufficient to guarantee functional\nspecialization, and find that in general, this doesn't necessarily hold except\nat extreme levels. We then systematically tested which features of the\nenvironment and network do lead to the emergence of specialization. We used a\nsimple toy environment, task and network, allowing us precise control, and show\nthat in this setup, several distinct measures of specialization give\nqualitatively similar results. We further find that (1) specialization can only\nemerge in environments where features of that environment are meaningfully\nseparable, (2) specialization preferentially emerges when the network is\nstrongly resource-constrained, and (3) these findings are qualitatively similar\nacross different network architectures, but the quantitative relationships\ndepends on the architecture type. Finally, we show that functional\nspecialization varies dynamically across time, and demonstrate that these\ndynamics depend on both the timing and bandwidth of information flow in the\nnetwork. We conclude that a static notion of specialization, based on\nstructural modularity, is likely too simple a framework for understanding\nintelligence in situations of real-world complexity, from biology to\nbrain-inspired neuromorphic systems. We propose that thoroughly stress testing\ncandidate definitions of functional modularity in simplified scenarios before\nextending to more complex data, network models and electrophysiological\nrecordings is likely to be a fruitful approach.\n","authors":["Gabriel Béna","Dan F. M. Goodman"],"pdf_url":"https://arxiv.org/pdf/2106.02626v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.09178v5","updated":"2023-09-30T15:52:10Z","published":"2022-12-18T21:53:33Z","title":"Support Vector Regression: Risk Quadrangle Framework","summary":" This paper investigates Support Vector Regression (SVR) in the context of the\nfundamental risk quadrangle theory, which links optimization, risk management,\nand statistical estimation. It is shown that both formulations of SVR,\n$\\varepsilon$-SVR and $\\nu$-SVR, correspond to the minimization of equivalent\nerror measures (Vapnik error and CVaR norm, respectively) with a regularization\npenalty. These error measures, in turn, define the corresponding risk\nquadrangles. By constructing the fundamental risk quadrangle, which corresponds\nto SVR, we show that SVR is the asymptotically unbiased estimator of the\naverage of two symmetric conditional quantiles. Further, we prove the\nequivalence of the $\\varepsilon$-SVR and $\\nu$-SVR in a general stochastic\nsetting. Additionally, SVR is formulated as a regular deviation minimization\nproblem with a regularization penalty. 
Finally, the dual formulation of SVR in\nthe risk quadrangle framework is derived.\n","authors":["Anton Malandii","Stan Uryasev"],"pdf_url":"https://arxiv.org/pdf/2212.09178v5.pdf","comment":"Incomplete result"},{"id":"http://arxiv.org/abs/2309.11054v2","updated":"2023-09-30T15:51:38Z","published":"2023-09-20T04:17:28Z","title":"Design of Chain-of-Thought in Math Problem Solving","summary":" Chain-of-Thought (CoT) plays a crucial role in reasoning for math problem\nsolving. We conduct a comprehensive examination of methods for designing CoT,\ncomparing conventional natural language CoT with various program CoTs,\nincluding the self-describing program, the comment-describing program, and the\nnon-describing program. Furthermore, we investigate the impact of programming\nlanguage on program CoTs, comparing Python and Wolfram Language. Through\nextensive experiments on GSM8K, MATHQA, and SVAMP, we find that program CoTs\noften have superior effectiveness in math problem solving. Notably, the best\nperforming combination with 30B parameters beats GPT-3.5-turbo by a significant\nmargin. The results show that self-describing program offers greater diversity\nand thus can generally achieve higher performance. We also find that Python is\na better choice of language than Wolfram for program CoTs. The experimental\nresults provide a valuable guideline for future CoT designs that take into\naccount both programming language and coding style for further advancements.\nOur datasets and code are publicly available.\n","authors":["Zhanming Jie","Trung Quoc Luong","Xinbo Zhang","Xiaoran Jin","Hang Li"],"pdf_url":"https://arxiv.org/pdf/2309.11054v2.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2303.13006v2","updated":"2023-09-30T15:29:50Z","published":"2023-03-23T03:02:09Z","title":"Controllable Inversion of Black-Box Face Recognition Models via\n Diffusion","summary":" Face recognition models embed a face image into a low-dimensional identity\nvector containing abstract encodings of identity-specific facial features that\nallow individuals to be distinguished from one another. We tackle the\nchallenging task of inverting the latent space of pre-trained face recognition\nmodels without full model access (i.e. black-box setting). A variety of methods\nhave been proposed in literature for this task, but they have serious\nshortcomings such as a lack of realistic outputs and strong requirements for\nthe data set and accessibility of the face recognition model. By analyzing the\nblack-box inversion problem, we show that the conditional diffusion model loss\nnaturally emerges and that we can effectively sample from the inverse\ndistribution even without an identity-specific loss. Our method, named identity\ndenoising diffusion probabilistic model (ID3PM), leverages the stochastic\nnature of the denoising diffusion process to produce high-quality,\nidentity-preserving face images with various backgrounds, lighting, poses, and\nexpressions. We demonstrate state-of-the-art performance in terms of identity\npreservation and diversity both qualitatively and quantitatively, and our\nmethod is the first black-box face recognition model inversion method that\noffers intuitive control over the generation process.\n","authors":["Manuel Kansy","Anton Raël","Graziana Mignone","Jacek Naruniec","Christopher Schroers","Markus Gross","Romann M. Weber"],"pdf_url":"https://arxiv.org/pdf/2303.13006v2.pdf","comment":"8 pages main paper + 23 pages supplementary material. 
Moderate\n revisions from v1 (different template, added user study, wording). Presented\n at AMFG workshop at ICCV 2023. Project page:\n https://studios.disneyresearch.com/2023/10/02/controllable-inversion-of-black-box-face-recognition-models-via-diffusion/"},{"id":"http://arxiv.org/abs/2301.11443v3","updated":"2023-09-30T15:24:11Z","published":"2023-01-26T22:17:00Z","title":"Limitless stability for Graph Convolutional Networks","summary":" This work establishes rigorous, novel and widely applicable stability\nguarantees and transferability bounds for graph convolutional networks --\nwithout reference to any underlying limit object or statistical distribution.\nCrucially, utilized graph-shift operators (GSOs) are not necessarily assumed to\nbe normal, allowing for the treatment of networks on both undirected- and for\nthe first time also directed graphs. Stability to node-level perturbations is\nrelated to an 'adequate (spectral) covering' property of the filters in each\nlayer. Stability to edge-level perturbations is related to Lipschitz constants\nand newly introduced semi-norms of filters. Results on stability to topological\nperturbations are obtained through recently developed mathematical-physics\nbased tools. As an important and novel example, it is showcased that graph\nconvolutional networks are stable under graph-coarse-graining procedures\n(replacing strongly-connected sub-graphs by single nodes) precisely if the GSO\nis the graph Laplacian and filters are regular at infinity. These new\ntheoretical results are supported by corresponding numerical investigations.\n","authors":["Christian Koke"],"pdf_url":"https://arxiv.org/pdf/2301.11443v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.10310v4","updated":"2023-09-30T14:38:44Z","published":"2023-03-18T02:42:18Z","title":"Domain-knowledge Inspired Pseudo Supervision (DIPS) for Unsupervised\n Image-to-Image Translation Models to Support Cross-Domain Classification","summary":" The ability to classify images is dependent on having access to large labeled\ndatasets and testing on data from the same domain that the model can train on.\nClassification becomes more challenging when dealing with new data from a\ndifferent domain, where gathering and especially labeling a larger image\ndataset for retraining a classification model requires a labor-intensive human\neffort. Cross-domain classification frameworks were developed to handle this\ndata domain shift problem by utilizing unsupervised image-to-image translation\nmodels to translate an input image from the unlabeled domain to the labeled\ndomain. The problem with these unsupervised models lies in their unsupervised\nnature. For lack of annotations, it is not possible to use the traditional\nsupervised metrics to evaluate these translation models to pick the best-saved\ncheckpoint model. This paper introduces a new method called Domain-knowledge\nInspired Pseudo Supervision (DIPS) which utilizes domain-informed Gaussian\nMixture Models to generate pseudo annotations to enable the use of traditional\nsupervised metrics. This method was designed specifically to support\ncross-domain classification applications contrary to other typically used\nmetrics such as the FID which were designed to evaluate the model in terms of\nthe quality of the generated image from a human-eye perspective. DIPS proves\nits effectiveness by outperforming various GAN evaluation metrics, including\nFID, when selecting the optimal saved checkpoint model. 
It is also evaluated\nagainst truly supervised metrics. Furthermore, DIPS showcases its robustness\nand interpretability by demonstrating a strong correlation with truly\nsupervised metrics, highlighting its superiority over existing state-of-the-art\nalternatives. The code and data to replicate the results can be found on the\nofficial Github repository: https://github.com/Hindawi91/DIPS\n","authors":["Firas Al-Hindawi","Md Mahfuzur Rahman Siddiquee","Teresa Wu","Han Hu","Ying Sun"],"pdf_url":"https://arxiv.org/pdf/2303.10310v4.pdf","comment":"arXiv admin note: text overlap with arXiv:2212.09107"},{"id":"http://arxiv.org/abs/2305.15798v2","updated":"2023-09-30T13:58:51Z","published":"2023-05-25T07:28:28Z","title":"On Architectural Compression of Text-to-Image Diffusion Models","summary":" Exceptional text-to-image (T2I) generation results of Stable Diffusion models\n(SDMs) come with substantial computational demands. To resolve this issue,\nrecent research on efficient SDMs has prioritized reducing the number of\nsampling steps and utilizing network quantization. Orthogonal to these\ndirections, this study highlights the power of classical architectural\ncompression for general-purpose T2I synthesis by introducing block-removed\nknowledge-distilled SDMs (BK-SDMs). We eliminate several residual and attention\nblocks from the U-Net of SDMs, obtaining over a 30% reduction in the number of\nparameters, MACs per sampling step, and latency. We conduct distillation-based\npretraining with only 0.22M LAION pairs (fewer than 0.1% of the full training\npairs) on a single A100 GPU. Despite being trained with limited resources, our\ncompact models can imitate the original SDM by benefiting from transferred\nknowledge and achieve competitive results against larger multi-billion\nparameter models on the zero-shot MS-COCO benchmark. Moreover, we demonstrate\nthe applicability of our lightweight pretrained models in personalized\ngeneration with DreamBooth finetuning. Code and models can be found at:\nhttps://github.com/Nota-NetsPresso/BK-SDM\n","authors":["Bo-Kyeong Kim","Hyoung-Kyu Song","Thibault Castells","Shinkook Choi"],"pdf_url":"https://arxiv.org/pdf/2305.15798v2.pdf","comment":"Updated results: mobile inference, different training data volumes,\n and pruning sensitivity analysis; Short version: accepted to ICML Workshop on\n ES-FoMo (2023)"},{"id":"http://arxiv.org/abs/2302.07477v3","updated":"2023-09-30T13:31:31Z","published":"2023-02-15T05:43:17Z","title":"Optimal Sample Complexity of Reinforcement Learning for Mixing\n Discounted Markov Decision Processes","summary":" We consider the optimal sample complexity theory of tabular reinforcement\nlearning (RL) for maximizing the infinite horizon discounted reward in a Markov\ndecision process (MDP). Optimal worst-case complexity results have been\ndeveloped for tabular RL problems in this setting, leading to a sample\ncomplexity dependence on $\\gamma$ and $\\epsilon$ of the form $\\tilde\n\\Theta((1-\\gamma)^{-3}\\epsilon^{-2})$, where $\\gamma$ denotes the discount\nfactor and $\\epsilon$ is the solution error tolerance. However, in many\napplications of interest, the optimal policy (or all policies) induces mixing.\nWe establish that in such settings, the optimal sample complexity dependence is\n$\\tilde \\Theta(t_{\\text{mix}}(1-\\gamma)^{-2}\\epsilon^{-2})$, where\n$t_{\\text{mix}}$ is the total variation mixing time. 
Our analysis is grounded\nin regeneration-type ideas, which we believe are of independent interest, as\nthey can be used to study RL problems for general state space MDPs.\n","authors":["Shengbo Wang","Jose Blanchet","Peter Glynn"],"pdf_url":"https://arxiv.org/pdf/2302.07477v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.01639v2","updated":"2023-09-30T12:33:13Z","published":"2023-05-02T17:52:58Z","title":"Privacy-Preserving In-Context Learning for Large Language Models","summary":" In-context learning (ICL) is an important capability of Large Language Models\n(LLMs), enabling these models to dynamically adapt based on specific,\nin-context exemplars, thereby improving accuracy and relevance. However, LLM's\nresponses may leak the sensitive private information contained in in-context\nexemplars. To address this challenge, we propose Differentially Private\nIn-context Learning (DP-ICL), a general paradigm for privatizing ICL tasks. The\nkey idea for DP-ICL paradigm is generating differentially private responses\nthrough a noisy consensus among an ensemble of LLM's responses based on\ndisjoint exemplar sets. Based on the general paradigm of DP-ICL, we instantiate\nseveral techniques showing how to privatize ICL for text classification and\nlanguage generation. We evaluate DP-ICL on four text classification benchmarks\nand two language generation tasks, and our empirical results show that DP-ICL\nachieves a strong utility-privacy tradeoff.\n","authors":["Tong Wu","Ashwinee Panda","Jiachen T. Wang","Prateek Mittal"],"pdf_url":"https://arxiv.org/pdf/2305.01639v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.10924v3","updated":"2023-09-30T12:05:20Z","published":"2023-05-18T12:38:21Z","title":"Structural Pruning for Diffusion Models","summary":" Generative modeling has recently undergone remarkable advancements, primarily\npropelled by the transformative implications of Diffusion Probabilistic Models\n(DPMs). The impressive capability of these models, however, often entails\nsignificant computational overhead during both training and inference. To\ntackle this challenge, we present Diff-Pruning, an efficient compression method\ntailored for learning lightweight diffusion models from pre-existing ones,\nwithout the need for extensive re-training. The essence of Diff-Pruning is\nencapsulated in a Taylor expansion over pruned timesteps, a process that\ndisregards non-contributory diffusion steps and ensembles informative gradients\nto identify important weights. Our empirical assessment, undertaken across\nseveral datasets highlights two primary benefits of our proposed method: 1)\nEfficiency: it enables approximately a 50\\% reduction in FLOPs at a mere 10\\%\nto 20\\% of the original training expenditure; 2) Consistency: the pruned\ndiffusion models inherently preserve generative behavior congruent with their\npre-trained models. Code is available at\n\\url{https://github.com/VainF/Diff-Pruning}.\n","authors":["Gongfan Fang","Xinyin Ma","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2305.10924v3.pdf","comment":"Preprint version"},{"id":"http://arxiv.org/abs/2306.14975v2","updated":"2023-09-30T10:41:46Z","published":"2023-06-26T18:01:47Z","title":"The Underlying Scaling Laws and Universal Statistical Structure of\n Complex Datasets","summary":" We study universal traits which emerge both in real-world complex datasets,\nas well as in artificially generated ones. 
Our approach is to analogize data to\na physical system and employ tools from statistical physics and Random Matrix\nTheory (RMT) to reveal their underlying structure. We focus on the\nfeature-feature covariance matrix, analyzing both its local and global\neigenvalue statistics. Our main observations are: (i) The power-law scalings\nthat the bulk of its eigenvalues exhibit are vastly different for uncorrelated\nnormally distributed data compared to real-world data, (ii) this scaling\nbehavior can be completely modeled by generating gaussian data with long range\ncorrelations, (iii) both generated and real-world datasets lie in the same\nuniversality class from the RMT perspective, as chaotic rather than integrable\nsystems, (iv) the expected RMT statistical behavior already manifests for\nempirical covariance matrices at dataset sizes significantly smaller than those\nconventionally used for real-world training, and can be related to the number\nof samples required to approximate the population power-law scaling behavior,\n(v) the Shannon entropy is correlated with local RMT structure and eigenvalues\nscaling, and substantially smaller in strongly correlated datasets compared to\nuncorrelated synthetic data, and requires fewer samples to reach the\ndistribution entropy. These findings show that with sufficient sample size, the\nGram matrix of natural image datasets can be well approximated by a Wishart\nrandom matrix with a simple covariance structure, opening the door to rigorous\nstudies of neural network dynamics and generalization which rely on the data\nGram matrix.\n","authors":["Noam Levi","Yaron Oz"],"pdf_url":"https://arxiv.org/pdf/2306.14975v2.pdf","comment":"18 pages, 9 figures"},{"id":"http://arxiv.org/abs/2305.19947v2","updated":"2023-09-30T10:40:18Z","published":"2023-05-31T15:33:16Z","title":"A Geometric Perspective on Diffusion Models","summary":" Recent years have witnessed significant progress in developing effective\ntraining and fast sampling techniques for diffusion models. A remarkable\nadvancement is the use of stochastic differential equations (SDEs) and their\nmarginal-preserving ordinary differential equations (ODEs) to describe data\nperturbation and generative modeling in a unified framework. In this paper, we\ncarefully inspect the ODE-based sampling of a popular variance-exploding SDE\nand reveal several intriguing structures of its sampling dynamics. We discover\nthat the data distribution and the noise distribution are smoothly connected\nwith a quasi-linear sampling trajectory and another implicit denoising\ntrajectory that even converges faster. 
Meanwhile, the denoising trajectory\ngoverns the curvature of the corresponding sampling trajectory and its various\nfinite differences yield all second-order samplers used in practice.\nFurthermore, we establish a theoretical relationship between the optimal\nODE-based sampling and the classic mean-shift (mode-seeking) algorithm, with\nwhich we can characterize the asymptotic behavior of diffusion models and\nidentify the empirical score deviation.\n","authors":["Defang Chen","Zhenyu Zhou","Jian-Ping Mei","Chunhua Shen","Chun Chen","Can Wang"],"pdf_url":"https://arxiv.org/pdf/2305.19947v2.pdf","comment":"38 pages"},{"id":"http://arxiv.org/abs/2205.12841v4","updated":"2023-09-30T09:37:33Z","published":"2022-05-25T15:10:15Z","title":"Marginal Post Processing of Bayesian Inference Products with Normalizing\n Flows and Kernel Density Estimators","summary":" Bayesian analysis has become an indispensable tool across many different\ncosmological fields including the study of gravitational waves, the Cosmic\nMicrowave Background and the 21-cm signal from the Cosmic Dawn among other\nphenomena. The method provides a way to fit complex models to data describing\nkey cosmological and astrophysical signals and a whole host of contaminating\nsignals and instrumental effects modelled with `nuisance parameters'. In this\npaper, we summarise a method that uses Masked Autoregressive Flows and Kernel\nDensity Estimators to learn marginal posterior densities corresponding to core\nscience parameters. We find that the marginal or 'nuisance-free' posteriors and\nthe associated likelihoods have an abundance of applications including; the\ncalculation of previously intractable marginal Kullback-Leibler divergences and\nmarginal Bayesian Model Dimensionalities, likelihood emulation and prior\nemulation. We demonstrate each application using toy examples, examples from\nthe field of 21-cm cosmology and samples from the Dark Energy Survey. We\ndiscuss how marginal summary statistics like the Kullback-Leibler divergences\nand Bayesian Model Dimensionalities can be used to examine the constraining\npower of different experiments and how we can perform efficient joint analysis\nby taking advantage of marginal prior and likelihood emulators. We package our\nmultipurpose code up in the pip-installable code margarine for use in the wider\nscientific community.\n","authors":["Harry T. J. Bevins","William J. Handley","Pablo Lemos","Peter H. Sims","Eloy de Lera Acedo","Anastasia Fialkov","Justin Alsing"],"pdf_url":"https://arxiv.org/pdf/2205.12841v4.pdf","comment":"Accepted for MNRAS"},{"id":"http://arxiv.org/abs/2307.04081v2","updated":"2023-09-30T09:35:46Z","published":"2023-07-09T01:41:22Z","title":"Score-based Conditional Generation with Fewer Labeled Data by\n Self-calibrating Classifier Guidance","summary":" Score-based generative models (SGMs) are a popular family of deep generative\nmodels that achieve leading image generation quality. Early studies extend SGMs\nto tackle class-conditional generation by coupling an unconditional SGM with\nthe guidance of a trained classifier. Nevertheless, such classifier-guided SGMs\ndo not always achieve accurate conditional generation, especially when trained\nwith fewer labeled data. We argue that the problem is rooted in the\nclassifier's tendency to overfit without coordinating with the underlying\nunconditional distribution. We propose improving classifier-guided SGMs by\nletting the classifier regularize itself to respect the unconditional\ndistribution. 
Our key idea is to use principles from energy-based models to\nconvert the classifier as another view of the unconditional SGM. Then, existing\nloss for the unconditional SGM can be leveraged to achieve regularization by\ncalibrating the classifier's internal unconditional scores. The regularization\nscheme can be applied to not only the labeled data but also unlabeled ones to\nfurther improve the classifier. Empirical results show that the proposed\napproach significantly improves conditional generation quality across various\npercentages of fewer labeled data. The results confirm the potential of the\nproposed approach for generative modeling with limited labeled data.\n","authors":["Paul Kuo-Ming Huang","Si-An Chen","Hsuan-Tien Lin"],"pdf_url":"https://arxiv.org/pdf/2307.04081v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.08375v2","updated":"2023-09-30T09:05:44Z","published":"2023-09-15T13:04:55Z","title":"Boosting Fair Classifier Generalization through Adaptive Priority\n Reweighing","summary":" With the increasing penetration of machine learning applications in critical\ndecision-making areas, calls for algorithmic fairness are more prominent.\nAlthough there have been various modalities to improve algorithmic fairness\nthrough learning with fairness constraints, their performance does not\ngeneralize well in the test set. A performance-promising fair algorithm with\nbetter generalizability is needed. This paper proposes a novel adaptive\nreweighing method to eliminate the impact of the distribution shifts between\ntraining and test data on model generalizability. Most previous reweighing\nmethods propose to assign a unified weight for each (sub)group. Rather, our\nmethod granularly models the distance from the sample predictions to the\ndecision boundary. Our adaptive reweighing method prioritizes samples closer to\nthe decision boundary and assigns a higher weight to improve the\ngeneralizability of fair classifiers. Extensive experiments are performed to\nvalidate the generalizability of our adaptive priority reweighing method for\naccuracy and fairness measures (i.e., equal opportunity, equalized odds, and\ndemographic parity) in tabular benchmarks. We also highlight the performance of\nour method in improving the fairness of language and vision models. The code is\navailable at https://github.com/che2198/APW.\n","authors":["Zhihao Hu","Yiran Xu","Mengnan Du","Jindong Gu","Xinmei Tian","Fengxiang He"],"pdf_url":"https://arxiv.org/pdf/2309.08375v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.11328v3","updated":"2023-09-30T08:41:04Z","published":"2023-04-22T06:06:28Z","title":"On Accelerating Diffusion-Based Sampling Process via Improved\n Integration Approximation","summary":" A popular approach to sample a diffusion-based generative model is to solve\nan ordinary differential equation (ODE). In existing samplers, the coefficients\nof the ODE solvers are pre-determined by the ODE formulation, the reverse\ndiscrete timesteps, and the employed ODE methods. In this paper, we consider\naccelerating several popular ODE-based sampling processes (including EDM, DDIM,\nand DPM-Solver) by optimizing certain coefficients via improved integration\napproximation (IIA). We propose to minimize, for each time step, a mean squared\nerror (MSE) function with respect to the selected coefficients. 
The MSE is\nconstructed by applying the original ODE solver for a set of fine-grained\ntimesteps, which in principle provides a more accurate integration\napproximation in predicting the next diffusion state. The proposed IIA\ntechnique does not require any change of a pre-trained model, and only\nintroduces a very small computational overhead for solving a number of\nquadratic optimization problems. Extensive experiments show that considerably\nbetter FID scores can be achieved by using IIA-EDM, IIA-DDIM, and\nIIA-DPM-Solver than the original counterparts when the neural function\nevaluation (NFE) is small (i.e., less than 25).\n","authors":["Guoqiang Zhang","Niwa Kenta","W. Bastiaan Kleijn"],"pdf_url":"https://arxiv.org/pdf/2304.11328v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.10007v2","updated":"2023-09-30T08:33:37Z","published":"2023-09-18T02:43:59Z","title":"Multi-Agent Deep Reinforcement Learning for Cooperative and Competitive\n Autonomous Vehicles using AutoDRIVE Ecosystem","summary":" This work presents a modular and parallelizable multi-agent deep\nreinforcement learning framework for imbibing cooperative as well as\ncompetitive behaviors within autonomous vehicles. We introduce AutoDRIVE\nEcosystem as an enabler to develop physically accurate and graphically\nrealistic digital twins of Nigel and F1TENTH, two scaled autonomous vehicle\nplatforms with unique qualities and capabilities, and leverage this ecosystem\nto train and deploy multi-agent reinforcement learning policies. We first\ninvestigate an intersection traversal problem using a set of cooperative\nvehicles (Nigel) that share limited state information with each other in single\nas well as multi-agent learning settings using a common policy approach. We\nthen investigate an adversarial head-to-head autonomous racing problem using a\ndifferent set of vehicles (F1TENTH) in a multi-agent learning setting using an\nindividual policy approach. In either set of experiments, a decentralized\nlearning architecture was adopted, which allowed robust training and testing of\nthe approaches in stochastic environments, since the agents were mutually\nindependent and exhibited asynchronous motion behavior. The problems were\nfurther aggravated by providing the agents with sparse observation spaces and\nrequiring them to sample control commands that implicitly satisfied the imposed\nkinodynamic as well as safety constraints. The experimental results for both\nproblem statements are reported in terms of quantitative metrics and\nqualitative remarks for training as well as deployment phases.\n","authors":["Tanmay Vilas Samak","Chinmay Vilas Samak","Venkat Krovi"],"pdf_url":"https://arxiv.org/pdf/2309.10007v2.pdf","comment":"Accepted as Multi-Agent Dynamic Games (MAD-Games) Workshop Paper at\n IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)\n 2023"},{"id":"http://arxiv.org/abs/2305.10865v2","updated":"2023-09-30T08:27:28Z","published":"2023-05-18T10:37:54Z","title":"Semantically Aligned Task Decomposition in Multi-Agent Reinforcement\n Learning","summary":" The difficulty of appropriately assigning credit is particularly heightened\nin cooperative MARL with sparse reward, due to the concurrent time and\nstructural scales involved. Automatic subgoal generation (ASG) has recently\nemerged as a viable MARL approach inspired by utilizing subgoals in\nintrinsically motivated reinforcement learning. 
However, end-to-end learning of\ncomplex task planning from sparse rewards without prior knowledge, undoubtedly\nrequires massive training samples. Moreover, the diversity-promoting nature of\nexisting ASG methods can lead to the \"over-representation\" of subgoals,\ngenerating numerous spurious subgoals of limited relevance to the actual task\nreward and thus decreasing the sample efficiency of the algorithm. To address\nthis problem and inspired by the disentangled representation learning, we\npropose a novel \"disentangled\" decision-making method, Semantically Aligned\ntask decomposition in MARL (SAMA), that prompts pretrained language models with\nchain-of-thought that can suggest potential goals, provide suitable goal\ndecomposition and subgoal allocation as well as self-reflection-based\nreplanning. Additionally, SAMA incorporates language-grounded RL to train each\nagent's subgoal-conditioned policy. SAMA demonstrates considerable advantages\nin sample efficiency compared to state-of-the-art ASG methods, as evidenced by\nits performance on two challenging sparse-reward tasks, Overcooked and MiniRTS.\n","authors":["Wenhao Li","Dan Qiao","Baoxiang Wang","Xiangfeng Wang","Bo Jin","Hongyuan Zha"],"pdf_url":"https://arxiv.org/pdf/2305.10865v2.pdf","comment":"54 pages, 16 figures"},{"id":"http://arxiv.org/abs/2308.12030v2","updated":"2023-09-30T07:54:22Z","published":"2023-08-23T09:43:10Z","title":"Prompt-Based Length Controlled Generation with Reinforcement Learning","summary":" Large language models (LLMs) like ChatGPT and GPT-4 have attracted great\nattention given their surprising performance on a wide range of NLP tasks.\nLength controlled generation of LLMs emerges as an important topic, which\nenables users to fully leverage the capability of LLMs in more real-world\nscenarios like generating a proper answer or essay of a desired length. In\naddition, the autoregressive generation in LLMs is extremely time-consuming,\nwhile the ability of controlling this generated length can reduce the inference\ncost by limiting the length. Therefore, we propose a prompt-based length\ncontrol method to achieve high-accuracy length controlled generation. In\nparticular, we adopt reinforcement learning with the reward signal given by\neither trainable or rule-based reward models, which further enhances the\nlength-control ability of LLMs by rewarding outputs that follows pre-defined\ncontrol instruction. To enable rule-based inference, we also introduce standard\nprompt extractor to collect the standard control information from users' input.\nExperiments show that our method significantly improves the accuracy of\nprompt-based length control for summarization task on popular datasets like\nCNNDM and NYT. Both the standard prompt extractor and the RL-tuned model have\nshow strong generalization ability to unseen control prompt templates.\n","authors":["Renlong Jie","Xiaojun Meng","Lifeng Shang","Xin Jiang","Qun Liu"],"pdf_url":"https://arxiv.org/pdf/2308.12030v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13050v3","updated":"2023-09-30T07:30:10Z","published":"2023-06-22T17:17:45Z","title":"Data augmentation and refinement for recommender system: A\n semi-supervised approach using maximum margin matrix factorization","summary":" Collaborative filtering (CF) has become a popular method for developing\nrecommender systems (RSs) where ratings of a user for new items are predicted\nbased on her past preferences and available preference information of other\nusers. 
Despite the popularity of CF-based methods, their performance is often\ngreatly limited by the sparsity of observed entries. In this study, we explore\nthe data augmentation and refinement aspects of Maximum Margin Matrix\nFactorization (MMMF), a widely accepted CF technique for rating predictions,\nwhich has not been investigated before. We exploit the inherent characteristics\nof CF algorithms to assess the confidence level of individual ratings and\npropose a semi-supervised approach for rating augmentation based on\nself-training. We hypothesize that any CF algorithm's predictions with low\nconfidence are due to some deficiency in the training data and hence, the\nperformance of the algorithm can be improved by adopting a systematic data\naugmentation strategy. We iteratively use some of the ratings predicted with\nhigh confidence to augment the training data and remove low-confidence entries\nthrough a refinement process. By repeating this process, the system learns to\nimprove prediction accuracy. Our method is experimentally evaluated on several\nstate-of-the-art CF algorithms and leads to informative rating augmentation,\nimproving the performance of the baseline approaches.\n","authors":["Shamal Shaikh","Venkateswara Rao Kagita","Vikas Kumar","Arun K Pujari"],"pdf_url":"https://arxiv.org/pdf/2306.13050v3.pdf","comment":"21 pages"},{"id":"http://arxiv.org/abs/2109.08346v2","updated":"2023-09-30T07:12:33Z","published":"2021-09-17T04:48:42Z","title":"Comfetch: Federated Learning of Large Networks on Constrained Clients\n via Sketching","summary":" Federated learning (FL) is a popular paradigm for private and collaborative\nmodel training on the edge. In centralized FL, the parameters of a global\narchitecture (such as a deep neural network) are maintained and distributed by\na central server/controller to clients who transmit model updates (gradients)\nback to the server based on local optimization. While many efforts have focused\non reducing the communication complexity of gradient transmission, the vast\nmajority of compression-based algorithms assume that each participating client\nis able to download and train the current and full set of parameters, which may\nnot be a practical assumption depending on the resource constraints of smaller\nclients such as mobile devices. In this work, we propose a simple yet effective\nnovel algorithm, Comfetch, which allows clients to train large networks using\nreduced representations of the global architecture via the count sketch, which\nreduces local computational and memory costs along with bi-directional\ncommunication complexity. We provide a nonconvex convergence guarantee and\nexperimentally demonstrate that it is possible to learn large models, such as a\ndeep convolutional network, through federated training on their sketched\ncounterparts. 
The resulting global models exhibit competitive test accuracy\nover CIFAR10/100 classification when compared against un-compressed model\ntraining.\n","authors":["Tahseen Rabbani","Brandon Feng","Marco Bornstein","Kyle Rui Sang","Yifan Yang","Arjun Rajkumar","Amitabh Varshney","Furong Huang"],"pdf_url":"https://arxiv.org/pdf/2109.08346v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.01597v2","updated":"2023-09-30T05:53:03Z","published":"2023-07-04T09:38:38Z","title":"Unlocking the Potential of Deep Learning in Peak-Hour Series Forecasting","summary":" Unlocking the potential of deep learning in Peak-Hour Series Forecasting\n(PHSF) remains a critical yet underexplored task in various domains. While\nstate-of-the-art deep learning models excel in regular Time Series Forecasting\n(TSF), they struggle to achieve comparable results in PHSF. This can be\nattributed to the challenges posed by the high degree of non-stationarity in\npeak-hour series, which makes direct forecasting more difficult than standard\nTSF. Additionally, manually extracting the maximum value from regular\nforecasting results leads to suboptimal performance due to models minimizing\nthe mean deficit. To address these issues, this paper presents Seq2Peak, a\nnovel framework designed specifically for PHSF tasks, bridging the performance\ngap observed in TSF models. Seq2Peak offers two key components: the CyclicNorm\npipeline to mitigate the non-stationarity issue and a simple yet effective\ntrainable-parameter-free peak-hour decoder with a hybrid loss function that\nutilizes both the original series and peak-hour series as supervised signals.\nExtensive experimentation on publicly available time series datasets\ndemonstrates the effectiveness of the proposed framework, yielding a remarkable\naverage relative improvement of 37.7% across four real-world datasets for both\ntransformer- and non-transformer-based TSF models.\n","authors":["Zhenwei Zhang","Xin Wang","Jingyuan Xie","Heling Zhang","Yuantao Gu"],"pdf_url":"https://arxiv.org/pdf/2307.01597v2.pdf","comment":"to be published in CIKM'23"},{"id":"http://arxiv.org/abs/2210.16656v2","updated":"2023-09-30T05:19:26Z","published":"2022-10-29T17:36:51Z","title":"Auxo: Efficient Federated Learning via Scalable Client Clustering","summary":" Federated learning (FL) is an emerging machine learning (ML) paradigm that\nenables heterogeneous edge devices to collaboratively train ML models without\nrevealing their raw data to a logically centralized server. However, beyond the\nheterogeneous device capacity, FL participants often exhibit differences in\ntheir data distributions, which are not independent and identically distributed\n(Non-IID). Many existing works present point solutions to address issues like\nslow convergence, low final accuracy, and bias in FL, all stemming from client\nheterogeneity. In this paper, we explore an additional layer of complexity to\nmitigate such heterogeneity by grouping clients with statistically similar data\ndistributions (cohorts). We propose Auxo to gradually identify such cohorts in\nlarge-scale, low-availability, and resource-constrained FL populations. Auxo\nthen adaptively determines how to train cohort-specific models in order to\nachieve better model performance and ensure resource efficiency. 
Our extensive\nevaluations show that, by identifying cohorts with smaller heterogeneity and\nperforming efficient cohort-based training, Auxo boosts various existing FL\nsolutions in terms of final accuracy (2.1% - 8.2%), convergence time (up to\n2.2x), and model bias (4.8% - 53.8%).\n","authors":["Jiachen Liu","Fan Lai","Yinwei Dai","Aditya Akella","Harsha Madhyastha","Mosharaf Chowdhury"],"pdf_url":"https://arxiv.org/pdf/2210.16656v2.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2308.03901v2","updated":"2023-09-30T04:50:40Z","published":"2023-08-07T20:28:22Z","title":"FLIPS: Federated Learning using Intelligent Participant Selection","summary":" This paper presents the design and implementation of FLIPS, a middleware\nsystem to manage data and participant heterogeneity in federated learning (FL)\ntraining workloads. In particular, we examine the benefits of label\ndistribution clustering on participant selection in federated learning. FLIPS\nclusters parties involved in an FL training job based on the label distribution\nof their data apriori, and during FL training, ensures that each cluster is\nequitably represented in the participants selected. FLIPS can support the most\ncommon FL algorithms, including FedAvg, FedProx, FedDyn, FedOpt and FedYogi. To\nmanage platform heterogeneity and dynamic resource availability, FLIPS\nincorporates a straggler management mechanism to handle changing capacities in\ndistributed, smart community applications. Privacy of label distributions,\nclustering and participant selection is ensured through a trusted execution\nenvironment (TEE). Our comprehensive empirical evaluation compares FLIPS with\nrandom participant selection, as well as three other \"smart\" selection\nmechanisms - Oort, TiFL and gradient clustering using two real-world datasets,\ntwo benchmark datasets, two different non-IID distributions and three common FL\nalgorithms (FedYogi, FedProx and FedAvg). We demonstrate that FLIPS\nsignificantly improves convergence, achieving higher accuracy by 17 - 20 % with\n20 - 60 % lower communication costs, and these benefits endure in the presence\nof straggler participants.\n","authors":["Rahul Atul Bhope","K. R. Jayaram","Nalini Venkatasubramanian","Ashish Verma","Gegi Thomas"],"pdf_url":"https://arxiv.org/pdf/2308.03901v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.13579v3","updated":"2023-09-30T04:41:21Z","published":"2022-11-24T13:08:43Z","title":"Knowledge-Aware Federated Active Learning with Non-IID Data","summary":" Federated learning enables multiple decentralized clients to learn\ncollaboratively without sharing the local training data. However, the expensive\nannotation cost to acquire data labels on local clients remains an obstacle in\nutilizing local data. In this paper, we propose a federated active learning\nparadigm to efficiently learn a global model with limited annotation budget\nwhile protecting data privacy in a decentralized learning way. The main\nchallenge faced by federated active learning is the mismatch between the active\nsampling goal of the global model on the server and that of the asynchronous\nlocal clients. This becomes even more significant when data is distributed\nnon-IID across local clients. To address the aforementioned challenge, we\npropose Knowledge-Aware Federated Active Learning (KAFAL), which consists of\nKnowledge-Specialized Active Sampling (KSAS) and Knowledge-Compensatory\nFederated Update (KCFU). 
KSAS is a novel active sampling method tailored for\nthe federated active learning problem. It deals with the mismatch challenge by\nsampling actively based on the discrepancies between local and global models.\nKSAS intensifies specialized knowledge in local clients, ensuring the sampled\ndata to be informative for both the local clients and the global model. KCFU,\nin the meantime, deals with the client heterogeneity caused by limited data and\nnon-IID data distributions. It compensates for each client's ability in weak\nclasses by the assistance of the global model. Extensive experiments and\nanalyses are conducted to show the superiority of KSAS over the\nstate-of-the-art active learning methods and the efficiency of KCFU under the\nfederated active learning framework.\n","authors":["Yu-Tong Cao","Ye Shi","Baosheng Yu","Jingya Wang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2211.13579v3.pdf","comment":"14 pages, 12 figures, ICCV23"},{"id":"http://arxiv.org/abs/2208.07978v2","updated":"2023-09-30T04:32:58Z","published":"2022-08-16T22:31:50Z","title":"Enhancing Heterogeneous Federated Learning with Knowledge Extraction and\n Multi-Model Fusion","summary":" Concerned with user data privacy, this paper presents a new federated\nlearning (FL) method that trains machine learning models on edge devices\nwithout accessing sensitive data. Traditional FL methods, although\nprivacy-protective, fail to manage model heterogeneity and incur high\ncommunication costs due to their reliance on aggregation methods. To address\nthis limitation, we propose a resource-aware FL method that aggregates local\nknowledge from edge models and distills it into robust global knowledge through\nknowledge distillation. This method allows efficient multi-model knowledge\nfusion and the deployment of resource-aware models while preserving model\nheterogeneity. Our method improves communication cost and performance in\nheterogeneous data and models compared to existing FL algorithms. Notably, it\nreduces the communication cost of ResNet-32 by up to 50\\% and VGG-11 by up to\n10$\\times$ while delivering superior performance.\n","authors":["Duy Phuong Nguyen","Sixing Yu","J. Pablo Muñoz","Ali Jannesari"],"pdf_url":"https://arxiv.org/pdf/2208.07978v2.pdf","comment":"Accept at the 4th workshop on Artificial Intelligence and Machine\n Learning for Scientific Applications (AI4S), SC 23"},{"id":"http://arxiv.org/abs/2306.05032v2","updated":"2023-09-30T04:09:55Z","published":"2023-06-08T08:34:58Z","title":"Log-based Anomaly Detection based on EVT Theory with feedback","summary":" System logs play a critical role in maintaining the reliability of software\nsystems. Fruitful studies have explored automatic log-based anomaly detection\nand achieved notable accuracy on benchmark datasets. However, when applied to\nlarge-scale cloud systems, these solutions face limitations due to high\nresource consumption and lack of adaptability to evolving logs. In this paper,\nwe present an accurate, lightweight, and adaptive log-based anomaly detection\nframework, referred to as SeaLog. Our method introduces a Trie-based Detection\nAgent (TDA) that employs a lightweight, dynamically-growing trie structure for\nreal-time anomaly detection. To enhance TDA's accuracy in response to evolving\nlog data, we enable it to receive feedback from experts. 
Interestingly, our\nfindings suggest that contemporary large language models, such as ChatGPT, can\nprovide feedback with a level of consistency comparable to human experts, which\ncan potentially reduce manual verification efforts. We extensively evaluate\nSeaLog on two public datasets and an industrial dataset. The results show that\nSeaLog outperforms all baseline methods in terms of effectiveness, runs 2X to\n10X faster and only consumes 5% to 41% of the memory resource.\n","authors":["Jinyang Liu","Junjie Huang","Yintong Huo","Zhihan Jiang","Jiazhen Gu","Zhuangbin Chen","Cong Feng","Minzhi Yan","Michael R. Lyu"],"pdf_url":"https://arxiv.org/pdf/2306.05032v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.07513v3","updated":"2023-09-30T03:42:38Z","published":"2022-10-14T04:37:24Z","title":"Continuous-in-time Limit for Bayesian Bandits","summary":" This paper revisits the bandit problem in the Bayesian setting. The Bayesian\napproach formulates the bandit problem as an optimization problem, and the goal\nis to find the optimal policy which minimizes the Bayesian regret. One of the\nmain challenges facing the Bayesian approach is that computation of the optimal\npolicy is often intractable, especially when the length of the problem horizon\nor the number of arms is large. In this paper, we first show that under a\nsuitable rescaling, the Bayesian bandit problem converges toward a continuous\nHamilton-Jacobi-Bellman (HJB) equation. The optimal policy for the limiting HJB\nequation can be explicitly obtained for several common bandit problems, and we\ngive numerical methods to solve the HJB equation when an explicit solution is\nnot available. Based on these results, we propose an approximate Bayes-optimal\npolicy for solving Bayesian bandit problems with large horizons. Our method has\nthe added benefit that its computational cost does not increase as the horizon\nincreases.\n","authors":["Yuhua Zhu","Zachary Izzo","Lexing Ying"],"pdf_url":"https://arxiv.org/pdf/2210.07513v3.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2309.09075v2","updated":"2023-09-30T20:42:32Z","published":"2023-09-16T18:48:40Z","title":"Music Generation based on Generative Adversarial Networks with\n Transformer","summary":" Autoregressive models based on Transformers have become the prevailing\napproach for generating music compositions that exhibit comprehensive musical\nstructure. These models are typically trained by minimizing the negative\nlog-likelihood (NLL) of the observed sequence in an autoregressive manner.\nHowever, when generating long sequences, the quality of samples from these\nmodels tends to significantly deteriorate due to exposure bias. To address this\nissue, we leverage classifiers trained to differentiate between real and\nsampled sequences to identify these failures. This observation motivates our\nexploration of adversarial losses as a complement to the NLL objective. We\nemploy a pre-trained Span-BERT model as the discriminator in the Generative\nAdversarial Network (GAN) framework, which enhances training stability in our\nexperiments. To optimize discrete sequences within the GAN framework, we\nutilize the Gumbel-Softmax trick to obtain a differentiable approximation of\nthe sampling process. Additionally, we partition the sequences into smaller\nchunks to ensure that memory constraints are met. 
Through human evaluations and\nthe introduction of a novel discriminative metric, we demonstrate that our\napproach outperforms a baseline model trained solely on likelihood\nmaximization.\n","authors":["Ziyi Jiang","Ruoxue Wu","Zhenghan Chen","Xiaoxuan Liang"],"pdf_url":"https://arxiv.org/pdf/2309.09075v2.pdf","comment":"co-author want to withdraw"},{"id":"http://arxiv.org/abs/2310.00455v1","updated":"2023-09-30T18:27:14Z","published":"2023-09-30T18:27:14Z","title":"Music- and Lyrics-driven Dance Synthesis","summary":" Lyrics often convey information about the songs that are beyond the auditory\ndimension, enriching the semantic meaning of movements and musical themes. Such\ninsights are important in the dance choreography domain. However, most existing\ndance synthesis methods mainly focus on music-to-dance generation, without\nconsidering the semantic information. To complement it, we introduce JustLMD, a\nnew multimodal dataset of 3D dance motion with music and lyrics. To the best of\nour knowledge, this is the first dataset with triplet information including\ndance motion, music, and lyrics. Additionally, we showcase a cross-modal\ndiffusion-based network designed to generate 3D dance motion conditioned on\nmusic and lyrics. The proposed JustLMD dataset encompasses 4.6 hours of 3D\ndance motion in 1867 sequences, accompanied by musical tracks and their\ncorresponding English lyrics.\n","authors":["Wenjie Yin","Qingyuan Yao","Yi Yu","Hang Yin","Danica Kragic","Mårten Björkman"],"pdf_url":"https://arxiv.org/pdf/2310.00455v1.pdf","comment":null}]},"2023-10-03T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2310.02263v1","updated":"2023-10-03T17:59:46Z","published":"2023-10-03T17:59:46Z","title":"Contrastive Post-training Large Language Models on Data Curriculum","summary":" Alignment serves as an important step to steer large language models (LLMs)\ntowards human preferences. In this paper, we explore contrastive post-training\ntechniques for alignment by automatically constructing preference pairs from\nmultiple models of varying strengths (e.g., InstructGPT, ChatGPT and GPT-4). We\ncarefully compare the contrastive techniques of SLiC and DPO to SFT baselines\nand find that DPO provides a step-function improvement even after continueing\nSFT saturates. We also explore a data curriculum learning scheme for\ncontrastive post-training, which starts by learning from \"easier\" pairs and\ntransitioning to \"harder\" ones, which further improves alignment. Finally, we\nscale up our experiments to train with more data and larger models like Orca.\nRemarkably, contrastive post-training further improves the performance of Orca,\nalready a state-of-the-art instruction learning model tuned with GPT-4 outputs,\nto exceed that of ChatGPT.\n","authors":["Canwen Xu","Corby Rosset","Luciano Del Corro","Shweti Mahajan","Julian McAuley","Jennifer Neville","Ahmed Hassan Awadallah","Nikhil Rao"],"pdf_url":"https://arxiv.org/pdf/2310.02263v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02264v1","updated":"2023-10-03T17:59:46Z","published":"2023-10-03T17:59:46Z","title":"Generalizable Long-Horizon Manipulations with Large Language Models","summary":" This work introduces a framework harnessing the capabilities of Large\nLanguage Models (LLMs) to generate primitive task conditions for generalizable\nlong-horizon manipulations with novel objects and unseen tasks. 
These task\nconditions serve as guides for the generation and adjustment of Dynamic\nMovement Primitives (DMP) trajectories for long-horizon task execution. We\nfurther create a challenging robotic manipulation task suite based on Pybullet\nfor long-horizon task evaluation. Extensive experiments in both simulated and\nreal-world environments demonstrate the effectiveness of our framework on both\nfamiliar tasks involving new objects and novel but related tasks, highlighting\nthe potential of LLMs in enhancing robotic system versatility and adaptability.\nProject website: https://object814.github.io/Task-Condition-With-LLM/\n","authors":["Haoyu Zhou","Mingyu Ding","Weikun Peng","Masayoshi Tomizuka","Lin Shao","Chuang Gan"],"pdf_url":"https://arxiv.org/pdf/2310.02264v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02255v1","updated":"2023-10-03T17:57:24Z","published":"2023-10-03T17:57:24Z","title":"MathVista: Evaluating Mathematical Reasoning of Foundation Models in\n Visual Contexts","summary":" Although Large Language Models (LLMs) and Large Multimodal Models (LMMs)\nexhibit impressive skills in various domains, their ability for mathematical\nreasoning within visual contexts has not been formally examined. Equipping LLMs\nand LMMs with this capability is vital for general-purpose AI assistants and\nshowcases promising potential in education, data analysis, and scientific\ndiscovery. To bridge this gap, we present MathVista, a benchmark designed to\namalgamate challenges from diverse mathematical and visual tasks. We first\ntaxonomize the key task types, reasoning skills, and visual contexts from the\nliterature to guide our selection from 28 existing math-focused and visual\nquestion answering datasets. Then, we construct three new datasets, IQTest,\nFunctionQA, and PaperQA, to accommodate for missing types of visual contexts.\nThe problems featured often require deep visual understanding beyond OCR or\nimage captioning, and compositional reasoning with rich domain-specific tools,\nthus posing a notable challenge to existing models. We conduct a comprehensive\nevaluation of 11 prominent open-source and proprietary foundation models (LLMs,\nLLMs augmented with tools, and LMMs), and early experiments with GPT-4V. The\nbest-performing model, Multimodal Bard, achieves only 58% of human performance\n(34.8% vs 60.3%), indicating ample room for further improvement. Given this\nsignificant gap, MathVista fuels future research in the development of\ngeneral-purpose AI agents capable of tackling mathematically intensive and\nvisually rich real-world tasks. Preliminary tests show that MathVista also\npresents challenges to GPT-4V, underscoring the benchmark's importance. The\nproject is available at https://mathvista.github.io/.\n","authors":["Pan Lu","Hritik Bansal","Tony Xia","Jiacheng Liu","Chunyuan Li","Hannaneh Hajishirzi","Hao Cheng","Kai-Wei Chang","Michel Galley","Jianfeng Gao"],"pdf_url":"https://arxiv.org/pdf/2310.02255v1.pdf","comment":"51 pages, 56 figures. Work in progress"},{"id":"http://arxiv.org/abs/2310.02249v1","updated":"2023-10-03T17:53:09Z","published":"2023-10-03T17:53:09Z","title":"Harnessing Pre-Trained Sentence Transformers for Offensive Language\n Detection in Indian Languages","summary":" In our increasingly interconnected digital world, social media platforms have\nemerged as powerful channels for the dissemination of hate speech and offensive\ncontent. 
This work delves into the domain of hate speech detection, placing\nspecific emphasis on three low-resource Indian languages: Bengali, Assamese,\nand Gujarati. The challenge is framed as a text classification task, aimed at\ndiscerning whether a tweet contains offensive or non-offensive content.\nLeveraging the HASOC 2023 datasets, we fine-tuned pre-trained BERT and SBERT\nmodels to evaluate their effectiveness in identifying hate speech. Our findings\nunderscore the superiority of monolingual sentence-BERT models, particularly in\nthe Bengali language, where we achieved the highest ranking. However, the\nperformance in Assamese and Gujarati languages signifies ongoing opportunities\nfor enhancement. Our goal is to foster inclusive online spaces by countering\nhate speech proliferation.\n","authors":["Ananya Joshi","Raviraj Joshi"],"pdf_url":"https://arxiv.org/pdf/2310.02249v1.pdf","comment":"HASOC at FIRE 2023"},{"id":"http://arxiv.org/abs/2310.02238v1","updated":"2023-10-03T17:48:14Z","published":"2023-10-03T17:48:14Z","title":"Who's Harry Potter? Approximate Unlearning in LLMs","summary":" Large language models (LLMs) are trained on massive internet corpora that\noften contain copyrighted content. This poses legal and ethical challenges for\nthe developers and users of these models, as well as the original authors and\npublishers. In this paper, we propose a novel technique for unlearning a subset\nof the training data from a LLM, without having to retrain it from scratch.\n We evaluate our technique on the task of unlearning the Harry Potter books\nfrom the Llama2-7b model (a generative language model recently open-sourced by\nMeta). While the model took over 184K GPU-hours to pretrain, we show that in\nabout 1 GPU hour of finetuning, we effectively erase the model's ability to\ngenerate or recall Harry Potter-related content, while its performance on\ncommon benchmarks (such as Winogrande, Hellaswag, arc, boolq and piqa) remains\nalmost unaffected. We make our fine-tuned model publicly available on\nHuggingFace for community evaluation. To the best of our knowledge, this is the\nfirst paper to present an effective technique for unlearning in generative\nlanguage models.\n Our technique consists of three main components: First, we use a reinforced\nmodel that is further trained on the target data to identify the tokens that\nare most related to the unlearning target, by comparing its logits with those\nof a baseline model. Second, we replace idiosyncratic expressions in the target\ndata with generic counterparts, and leverage the model's own predictions to\ngenerate alternative labels for every token. These labels aim to approximate\nthe next-token predictions of a model that has not been trained on the target\ndata. Third, we finetune the model on these alternative labels, which\neffectively erases the original text from the model's memory whenever it is\nprompted with its context.\n","authors":["Ronen Eldan","Mark Russinovich"],"pdf_url":"https://arxiv.org/pdf/2310.02238v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02235v1","updated":"2023-10-03T17:45:39Z","published":"2023-10-03T17:45:39Z","title":"Automatic Quality Assessment of Wikipedia Articles -- A Systematic\n Literature Review","summary":" Wikipedia is the world's largest online encyclopedia, but maintaining article\nquality through collaboration is challenging. Wikipedia designed a quality\nscale, but with such a manual assessment process, many articles remain\nunassessed. 
We review existing methods for automatically measuring the quality\nof Wikipedia articles, identifying and comparing machine learning algorithms,\narticle features, quality metrics, and used datasets, examining 149 distinct\nstudies, and exploring commonalities and gaps in them. The literature is\nextensive, and the approaches follow past technological trends. However,\nmachine learning is still not widely used by Wikipedia, and we hope that our\nanalysis helps future researchers change that reality.\n","authors":["Pedro Miguel Moás","Carla Teixeira Lopes"],"pdf_url":"https://arxiv.org/pdf/2310.02235v1.pdf","comment":"37 pages, 10 figures, just accepted in ACM Computing Surveys\n (September 2023). This is the author's version of the work. It is posted here\n for your personal use. Not for redistribution. The definitive Version of\n Record was published in ACM Computing Surveys,\n https://dx.doi.org/10.1145/3625286"},{"id":"http://arxiv.org/abs/2310.02229v1","updated":"2023-10-03T17:37:22Z","published":"2023-10-03T17:37:22Z","title":"Extraction of Medication and Temporal Relation from Clinical Text by\n Harnessing Different Deep Learning Models","summary":" Clinical texts, represented in electronic medical records (EMRs), contain\nrich medical information and are essential for disease prediction, personalised\ninformation recommendation, clinical decision support, and medication pattern\nmining and measurement. Relation extractions between medication mentions and\ntemporal information can further help clinicians better understand the\npatients' treatment history. To evaluate the performances of deep learning (DL)\nand large language models (LLMs) in medication extraction and temporal\nrelations classification, we carry out an empirical investigation of\n\\textbf{MedTem} project using several advanced learning structures including\nBiLSTM-CRF and CNN-BiLSTM for a clinical domain named entity recognition (NER),\nand BERT-CNN for temporal relation extraction (RE), in addition to the\nexploration of different word embedding techniques. Furthermore, we also\ndesigned a set of post-processing roles to generate structured output on\nmedications and the temporal relation. Our experiments show that CNN-BiLSTM\nslightly wins the BiLSTM-CRF model on the i2b2-2009 clinical NER task yielding\n75.67, 77.83, and 78.17 for precision, recall, and F1 scores using Macro\nAverage. BERT-CNN model also produced reasonable evaluation scores 64.48,\n67.17, and 65.03 for P/R/F1 using Macro Avg on the temporal relation extraction\ntest set from i2b2-2012 challenges. Code and Tools from MedTem will be hosted\nat \\url{https://github.com/HECTA-UoM/MedTem}\n","authors":["Hangyu Tu","Lifeng Han","Goran Nenadic"],"pdf_url":"https://arxiv.org/pdf/2310.02229v1.pdf","comment":"working paper, 35 pages"},{"id":"http://arxiv.org/abs/2310.02226v1","updated":"2023-10-03T17:32:41Z","published":"2023-10-03T17:32:41Z","title":"Think before you speak: Training Language Models With Pause Tokens","summary":" Language models generate responses by producing a series of tokens in\nimmediate succession: the $(K+1)^{th}$ token is an outcome of manipulating $K$\nhidden vectors per layer, one vector per preceding token. What if instead we\nwere to let the model manipulate say, $K+10$ hidden vectors, before it outputs\nthe $(K+1)^{th}$ token? We operationalize this idea by performing training and\ninference on language models with a (learnable) $\\textit{pause}$ token, a\nsequence of which is appended to the input prefix. 
We then delay extracting the\nmodel's outputs until the last pause token is seen, thereby allowing the model\nto process extra computation before committing to an answer. We empirically\nevaluate $\\textit{pause-training}$ on decoder-only models of 1B and 130M\nparameters with causal pretraining on C4, and on downstream tasks covering\nreasoning, question-answering, general understanding and fact recall. Our main\nfinding is that inference-time delays show gains when the model is both\npre-trained and finetuned with delays. For the 1B model, we witness gains on 8\nof 9 tasks, most prominently, a gain of $18\\%$ EM score on the QA task of\nSQuAD, $8\\%$ on CommonSenseQA and $1\\%$ accuracy on the reasoning task of\nGSM8k. Our work raises a range of conceptual and practical future research\nquestions on making delayed next-token prediction a widely applicable new\nparadigm.\n","authors":["Sachin Goyal","Ziwei Ji","Ankit Singh Rawat","Aditya Krishna Menon","Sanjiv Kumar","Vaishnavh Nagarajan"],"pdf_url":"https://arxiv.org/pdf/2310.02226v1.pdf","comment":"19 pages, 7 figures"},{"id":"http://arxiv.org/abs/2310.02224v1","updated":"2023-10-03T17:30:33Z","published":"2023-10-03T17:30:33Z","title":"Can Language Models be Instructed to Protect Personal Information?","summary":" Large multimodal language models have proven transformative in numerous\napplications. However, these models have been shown to memorize and leak\npre-training data, raising serious user privacy and information security\nconcerns. While data leaks should be prevented, it is also crucial to examine\nthe trade-off between the privacy protection and model utility of proposed\napproaches. In this paper, we introduce PrivQA -- a multimodal benchmark to\nassess this privacy/utility trade-off when a model is instructed to protect\nspecific categories of personal information in a simulated scenario. We also\npropose a technique to iteratively self-moderate responses, which significantly\nimproves privacy. However, through a series of red-teaming experiments, we find\nthat adversaries can also easily circumvent these protections with simple\njailbreaking methods through textual and/or image inputs. We believe PrivQA has\nthe potential to support the development of new models with improved privacy\nprotections, as well as the adversarial robustness of these protections. We\nrelease the entire PrivQA dataset at https://llm-access-control.github.io/.\n","authors":["Yang Chen","Ethan Mendes","Sauvik Das","Wei Xu","Alan Ritter"],"pdf_url":"https://arxiv.org/pdf/2310.02224v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.00193v3","updated":"2023-10-03T17:20:04Z","published":"2022-10-01T05:02:04Z","title":"FRMT: A Benchmark for Few-Shot Region-Aware Machine Translation","summary":" We present FRMT, a new dataset and evaluation benchmark for Few-shot\nRegion-aware Machine Translation, a type of style-targeted translation. The\ndataset consists of professional translations from English into two regional\nvariants each of Portuguese and Mandarin Chinese. Source documents are selected\nto enable detailed analysis of phenomena of interest, including lexically\ndistinct terms and distractor terms. We explore automatic evaluation metrics\nfor FRMT and validate their correlation with expert human evaluation across\nboth region-matched and mismatched rating scenarios. Finally, we present a\nnumber of baseline models for this task, and offer guidelines for how\nresearchers can train, evaluate, and compare their own models. 
Our dataset and\nevaluation code are publicly available: https://bit.ly/frmt-task\n","authors":["Parker Riley","Timothy Dozat","Jan A. Botha","Xavier Garcia","Dan Garrette","Jason Riesa","Orhan Firat","Noah Constant"],"pdf_url":"https://arxiv.org/pdf/2210.00193v3.pdf","comment":"Published in TACL Vol. 11 (2023)"},{"id":"http://arxiv.org/abs/2310.02207v1","updated":"2023-10-03T17:06:52Z","published":"2023-10-03T17:06:52Z","title":"Language Models Represent Space and Time","summary":" The capabilities of large language models (LLMs) have sparked debate over\nwhether such systems just learn an enormous collection of superficial\nstatistics or a coherent model of the data generating process -- a world model.\nWe find evidence for the latter by analyzing the learned representations of\nthree spatial datasets (world, US, NYC places) and three temporal datasets\n(historical figures, artworks, news headlines) in the Llama-2 family of models.\nWe discover that LLMs learn linear representations of space and time across\nmultiple scales. These representations are robust to prompting variations and\nunified across different entity types (e.g. cities and landmarks). In addition,\nwe identify individual ``space neurons'' and ``time neurons'' that reliably\nencode spatial and temporal coordinates. Our analysis demonstrates that modern\nLLMs acquire structured knowledge about fundamental dimensions such as space\nand time, supporting the view that they learn not merely superficial\nstatistics, but literal world models.\n","authors":["Wes Gurnee","Max Tegmark"],"pdf_url":"https://arxiv.org/pdf/2310.02207v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10490v4","updated":"2023-10-03T17:03:10Z","published":"2023-07-19T23:03:20Z","title":"Abusing Images and Sounds for Indirect Instruction Injection in\n Multi-Modal LLMs","summary":" We demonstrate how images and sounds can be used for indirect prompt and\ninstruction injection in multi-modal LLMs. An attacker generates an adversarial\nperturbation corresponding to the prompt and blends it into an image or audio\nrecording. When the user asks the (unmodified, benign) model about the\nperturbed image or audio, the perturbation steers the model to output the\nattacker-chosen text and/or make the subsequent dialog follow the attacker's\ninstruction. We illustrate this attack with several proof-of-concept examples\ntargeting LLaVa and PandaGPT.\n","authors":["Eugene Bagdasaryan","Tsung-Yin Hsieh","Ben Nassi","Vitaly Shmatikov"],"pdf_url":"https://arxiv.org/pdf/2307.10490v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01307v2","updated":"2023-10-03T16:40:35Z","published":"2023-10-02T16:13:08Z","title":"On the Generalization of Training-based ChatGPT Detection Methods","summary":" ChatGPT is one of the most popular language models which achieve amazing\nperformance on various natural language tasks. Consequently, there is also an\nurgent need to detect the texts generated ChatGPT from human written. One of\nthe extensively studied methods trains classification models to distinguish\nboth. However, existing studies also demonstrate that the trained models may\nsuffer from distribution shifts (during test), i.e., they are ineffective to\npredict the generated texts from unseen language tasks or topics. In this work,\nwe aim to have a comprehensive investigation on these methods' generalization\nbehaviors under distribution shift caused by a wide range of factors, including\nprompts, text lengths, topics, and language tasks. 
To achieve this goal, we\nfirst collect a new dataset with human and ChatGPT texts, and then we conduct\nextensive studies on the collected dataset. Our studies unveil insightful\nfindings which provide guidance for developing future methodologies or data\ncollection strategies for ChatGPT detection.\n","authors":["Han Xu","Jie Ren","Pengfei He","Shenglai Zeng","Yingqian Cui","Amy Liu","Hui Liu","Jiliang Tang"],"pdf_url":"https://arxiv.org/pdf/2310.01307v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02174v1","updated":"2023-10-03T16:08:41Z","published":"2023-10-03T16:08:41Z","title":"Ask Again, Then Fail: Large Language Models' Vacillations in Judgement","summary":" With the emergence of generative conversational large language models (LLMs)\nlike ChatGPT, serving as virtual assistants in various fields, the stability\nand reliability of their responses have become crucial. However, during usage,\nit has been observed that these models tend to waver in their judgements when\nconfronted with follow-up questions from users expressing skepticism or\ndisagreement. In this work, we draw inspiration from questioning strategies in\neducation and propose a \\textsc{Follow-up Questioning Mechanism} along with two\nevaluation metrics to assess the judgement consistency of LLMs before and after\nexposure to disturbances. We evaluate the judgement consistency of ChatGPT,\nPaLM2-Bison, and Vicuna-13B under this mechanism across eight reasoning\nbenchmarks. Empirical results show that even when the initial answers are\ncorrect, judgement consistency sharply decreases when LLMs face disturbances\nsuch as questioning, negation, or misleading. Additionally, we study these\nmodels' judgement consistency under various settings (sampling temperature and\nprompts) to validate this issue further, observing the impact of prompt tone\nand conducting an in-depth error analysis for deeper behavioral insights.\nFurthermore, we also explore several prompting methods to mitigate this issue\nand demonstrate their\neffectiveness\\footnote{\\url{https://github.com/NUSTM/LLMs-Waver-In-Judgements}}.\n","authors":["Qiming Xie","Zengzhi Wang","Yi Feng","Rui Xia"],"pdf_url":"https://arxiv.org/pdf/2310.02174v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02170v1","updated":"2023-10-03T16:05:48Z","published":"2023-10-03T16:05:48Z","title":"Dynamic LLM-Agent Network: An LLM-agent Collaboration Framework with\n Agent Team Optimization","summary":" Large language model (LLM) agents have been shown effective on a wide range\nof tasks, and by ensembling multiple LLM agents, their performances could be\nfurther improved. Existing approaches employ a fixed set of agents to interact\nwith each other in a static architecture, which limits their generalizability\nto various tasks and requires strong human prior in designing these agents. In\nthis work, we propose to construct a strategic team of agents communicating in\na dynamic interaction architecture based on the task query. Specifically, we\nbuild a framework named Dynamic LLM-Agent Network ($\\textbf{DyLAN}$) for\nLLM-agent collaboration on complicated tasks like reasoning and code\ngeneration. DyLAN enables agents to interact for multiple rounds in a dynamic\narchitecture with inference-time agent selection and an early-stopping\nmechanism to improve performance and efficiency. 
We further design an automatic\nagent team optimization algorithm based on an unsupervised metric termed\n$\\textit{Agent Importance Score}$, enabling the selection of best agents based\non the contribution each agent makes. Empirically, we demonstrate that DyLAN\nperforms well in both reasoning and code generation tasks with reasonable\ncomputational cost. DyLAN achieves 13.0% and 13.3% improvement on MATH and\nHumanEval, respectively, compared to a single execution on GPT-35-turbo. On\nspecific subjects of MMLU, agent team optimization in DyLAN increases accuracy\nby up to 25.0%.\n","authors":["Zijun Liu","Yanzhe Zhang","Peng Li","Yang Liu","Diyi Yang"],"pdf_url":"https://arxiv.org/pdf/2310.02170v1.pdf","comment":"Preprint, under review. 21 pages"},{"id":"http://arxiv.org/abs/2310.02168v1","updated":"2023-10-03T16:02:36Z","published":"2023-10-03T16:02:36Z","title":"Editing Personality for LLMs","summary":" This paper introduces an innovative task focused on editing the personality\ntraits of Large Language Models (LLMs). This task seeks to adjust the models'\nresponses to opinion-related questions on specified topics since an\nindividual's personality often manifests in the form of their expressed\nopinions, thereby showcasing different personality traits. Specifically, we\nconstruct a new benchmark dataset PersonalityEdit to address this task. Drawing\non the theory in Social Psychology, we isolate three representative traits,\nnamely Neuroticism, Extraversion, and Agreeableness, as the foundation for our\nbenchmark. We then gather data using GPT-4, generating responses that not only\nalign with a specified topic but also embody the targeted personality trait. We\nconduct comprehensive experiments involving various baselines and discuss the\nrepresentation of personality behavior in LLMs. Our intriguing findings uncover\npotential challenges of the proposed task, illustrating several remaining\nissues. We anticipate that our work can provide the NLP community with\ninsights. Code and datasets will be released at\nhttps://github.com/zjunlp/EasyEdit.\n","authors":["Shengyu Mao","Ningyu Zhang","Xiaohan Wang","Mengru Wang","Yunzhi Yao","Yong Jiang","Pengjun Xie","Fei Huang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2310.02168v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2310.02166v1","updated":"2023-10-03T15:57:00Z","published":"2023-10-03T15:57:00Z","title":"Large Language Models Meet Knowledge Graphs to Answer Factoid Questions","summary":" Recently, it has been shown that the incorporation of structured knowledge\ninto Large Language Models significantly improves the results for a variety of\nNLP tasks. In this paper, we propose a method for exploring pre-trained\nText-to-Text Language Models enriched with additional information from\nKnowledge Graphs for answering factoid questions. More specifically, we propose\nan algorithm for subgraphs extraction from a Knowledge Graph based on question\nentities and answer candidates. Then, we procure easily interpreted information\nwith Transformer-based models through the linearization of the extracted\nsubgraphs. 
Final re-ranking of the answer candidates with the extracted\ninformation boosts Hits@1 scores of the pre-trained text-to-text language\nmodels by 4-6%.\n","authors":["Mikhail Salnikov","Hai Le","Prateek Rajput","Irina Nikishina","Pavel Braslavski","Valentin Malykh","Alexander Panchenko"],"pdf_url":"https://arxiv.org/pdf/2310.02166v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.00674v4","updated":"2023-10-03T15:50:30Z","published":"2023-02-01T18:59:36Z","title":"Improving Few-Shot Generalization by Exploring and Exploiting Auxiliary\n Data","summary":" Few-shot learning is valuable in many real-world applications, but learning a\ngeneralizable model without overfitting to the few labeled datapoints is\nchallenging. In this work, we focus on Few-shot Learning with Auxiliary Data\n(FLAD), a training paradigm that assumes access to auxiliary data during\nfew-shot learning in hopes of improving generalization. Previous works have\nproposed automated methods for mixing auxiliary and target data, but these\nmethods typically scale linearly (or worse) with the number of auxiliary\ndatasets, limiting their practicality. In this work we relate FLAD to the\nexplore-exploit dilemma that is central to the multi-armed bandit setting and\nderive algorithms whose computational complexity is independent of the number\nof auxiliary datasets, allowing us to scale to 100x more auxiliary datasets\nthan prior methods. We propose two algorithms -- EXP3-FLAD and UCB1-FLAD -- and\ncompare them with prior FLAD methods that either explore or exploit, finding\nthat the combination of exploration and exploitation is crucial. Through\nextensive experimentation we find that our methods outperform all pre-existing\nFLAD methods by 4% and lead to the first 3 billion parameter language models\nthat outperform the 175 billion parameter GPT-3. Overall, our work suggests\nthat the discovery of better, more efficient mixing strategies for FLAD may\nprovide a viable path towards substantially improving generalization in\nfew-shot learning.\n","authors":["Alon Albalak","Colin Raffel","William Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2302.00674v4.pdf","comment":"NeurIPS 2023, 25 pages, 8 figures, code available at\n https://github.com/alon-albalak/FLAD"},{"id":"http://arxiv.org/abs/2310.02129v1","updated":"2023-10-03T15:10:46Z","published":"2023-10-03T15:10:46Z","title":"Unveiling the Pitfalls of Knowledge Editing for Large Language Models","summary":" As the cost associated with fine-tuning Large Language Models (LLMs)\ncontinues to rise, recent research efforts have pivoted towards developing\nmethodologies to edit implicit knowledge embedded within LLMs. Yet, there's\nstill a dark cloud lingering overhead -- will knowledge editing trigger\nbutterfly effect? since it is still unclear whether knowledge editing might\nintroduce side effects that pose potential risks or not. This paper pioneers\nthe investigation into the potential pitfalls associated with knowledge editing\nfor LLMs. To achieve this, we introduce new benchmark datasets and propose\ninnovative evaluation metrics. Our results underline two pivotal concerns: (1)\nKnowledge Conflict: Editing groups of facts that logically clash can magnify\nthe inherent inconsistencies in LLMs-a facet neglected by previous methods. 
(2)\nKnowledge Distortion: Altering parameters with the aim of editing factual\nknowledge can irrevocably warp the innate knowledge structure of LLMs.\nExperimental results vividly demonstrate that knowledge editing might\ninadvertently cast a shadow of unintended consequences on LLMs, which warrant\nattention and efforts for future works. Code will be released at\nhttps://github.com/zjunlp/PitfallsKnowledgeEditing.\n","authors":["Zhoubo Li","Ningyu Zhang","Yunzhi Yao","Mengru Wang","Xi Chen","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2310.02129v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2310.02124v1","updated":"2023-10-03T15:05:52Z","published":"2023-10-03T15:05:52Z","title":"Exploring Collaboration Mechanisms for LLM Agents: A Social Psychology\n View","summary":" As Natural Language Processing (NLP) systems are increasingly employed in\nintricate social environments, a pressing query emerges: Can these NLP systems\nmirror human-esque collaborative intelligence, in a multi-agent society\nconsisting of multiple large language models (LLMs)? This paper probes the\ncollaboration mechanisms among contemporary NLP systems by melding practical\nexperiments with theoretical insights. We fabricate four unique `societies'\ncomprised of LLM agents, where each agent is characterized by a specific\n`trait' (easy-going or overconfident) and engages in collaboration with a\ndistinct `thinking pattern' (debate or reflection). Evaluating these\nmulti-agent societies on three benchmark datasets, we discern that LLM agents\nnavigate tasks by leveraging diverse social behaviors, from active debates to\nintrospective reflections. Notably, certain collaborative strategies only\noptimize efficiency (using fewer API tokens), but also outshine previous\ntop-tier approaches. Moreover, our results further illustrate that LLM agents\nmanifest human-like social behaviors, such as conformity or majority rule,\nmirroring foundational Social Psychology theories. In conclusion, we integrate\ninsights from Social Psychology to contextualize the collaboration of LLM\nagents, inspiring further investigations into the collaboration mechanism for\nLLMs. We commit to sharing our code and datasets (already submitted in\nsupplementary materials), hoping to catalyze further research in this promising\navenue (All code and data are available at\n\\url{https://github.com/zjunlp/MachineSoM}.).\n","authors":["Jintian Zhang","Xin Xu","Shumin Deng"],"pdf_url":"https://arxiv.org/pdf/2310.02124v1.pdf","comment":"Work in Progress"},{"id":"http://arxiv.org/abs/2310.02118v1","updated":"2023-10-03T14:59:35Z","published":"2023-10-03T14:59:35Z","title":"TWIZ: The Wizard of Multimodal Conversational-Stimulus","summary":" In this report, we describe the vision, challenges, and scientific\ncontributions of the Task Wizard team, TWIZ, in the Alexa Prize TaskBot\nChallenge 2022. Our vision, is to build TWIZ bot as an helpful, multimodal,\nknowledgeable, and engaging assistant that can guide users towards the\nsuccessful completion of complex manual tasks. To achieve this, we focus our\nefforts on three main research questions: (1) Humanly-Shaped Conversations, by\nproviding information in a knowledgeable way; (2) Multimodal Stimulus, making\nuse of various modalities including voice, images, and videos; and (3)\nZero-shot Conversational Flows, to improve the robustness of the interaction to\nunseen scenarios. 
TWIZ is an assistant capable of supporting a wide range of\ntasks, with several innovative features such as creative cooking, video\nnavigation through voice, and the robust TWIZ-LLM, a Large Language Model\ntrained for dialoguing about complex manual tasks. Given ratings and feedback\nprovided by users, we observed that TWIZ bot is an effective and robust system,\ncapable of guiding users through tasks while providing several multimodal\nstimuli.\n","authors":["Rafael Ferreira","Diogo Tavares","Diogo Silva","Rodrigo Valério","João Bordalo","Inês Simões","Vasco Ramos","David Semedo","João Magalhães"],"pdf_url":"https://arxiv.org/pdf/2310.02118v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02107v1","updated":"2023-10-03T14:51:34Z","published":"2023-10-03T14:51:34Z","title":"Instance Needs More Care: Rewriting Prompts for Instances Yields Better\n Zero-Shot Performance","summary":" Enabling large language models (LLMs) to perform tasks in zero-shot has been\nan appealing goal owing to its labor-saving (i.e., requiring no task-specific\nannotations); as such, zero-shot prompting approaches also enjoy better task\ngeneralizability. To improve LLMs' zero-shot performance, prior work has\nfocused on devising more effective task instructions (e.g., ``let's think step\nby step'' ). However, we argue that, in order for an LLM to solve them\ncorrectly in zero-shot, individual test instances need more carefully designed\nand customized instructions. To this end, we propose PRoMPTd, an approach that\nrewrites the task prompt for each individual test input to be more specific,\nunambiguous, and complete, so as to provide better guidance to the task LLM. We\nevaluated PRoMPTd on eight datasets covering tasks including arithmetics,\nlogical reasoning, and code generation, using GPT-4 as the task LLM. Notably,\n\\algoname achieves an absolute improvement of around 10\\% on the complex MATH\ndataset and 5\\% on the code generation task on HumanEval, outperforming\nconventional zero-shot methods. In addition, we also showed that the rewritten\nprompt can provide better interpretability of how the LLM resolves each test\ninstance, which can potentially be leveraged as a defense mechanism against\nadversarial prompting. The source code and dataset can be obtained from\nhttps://github.com/salokr/PRoMPTd\n","authors":["Saurabh Srivastava","Chengyue Huang","Weiguo Fan","Ziyu Yao"],"pdf_url":"https://arxiv.org/pdf/2310.02107v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2307.16789v2","updated":"2023-10-03T14:45:48Z","published":"2023-07-31T15:56:53Z","title":"ToolLLM: Facilitating Large Language Models to Master 16000+ Real-world\n APIs","summary":" Despite the advancements of open-source large language models (LLMs), e.g.,\nLLaMA, they remain significantly limited in tool-use capabilities, i.e., using\nexternal tools (APIs) to fulfill human instructions. The reason is that current\ninstruction tuning largely focuses on basic language tasks but ignores the\ntool-use domain. This is in contrast to the excellent tool-use capabilities of\nstate-of-the-art (SOTA) closed-source LLMs, e.g., ChatGPT. To bridge this gap,\nwe introduce ToolLLM, a general tool-use framework encompassing data\nconstruction, model training, and evaluation. We first present ToolBench, an\ninstruction-tuning dataset for tool use, which is constructed automatically\nusing ChatGPT. 
Specifically, the construction can be divided into three stages:\n(i) API collection: we collect 16,464 real-world RESTful APIs spanning 49\ncategories from RapidAPI Hub; (ii) instruction generation: we prompt ChatGPT to\ngenerate diverse instructions involving these APIs, covering both single-tool\nand multi-tool scenarios; (iii) solution path annotation: we use ChatGPT to\nsearch for a valid solution path (chain of API calls) for each instruction. To\nenhance the reasoning capabilities of LLMs, we develop a novel depth-first\nsearch-based decision tree algorithm. It enables LLMs to evaluate multiple\nreasoning traces and expand the search space. Moreover, to evaluate the\ntool-use capabilities of LLMs, we develop an automatic evaluator: ToolEval.\nBased on ToolBench, we fine-tune LLaMA to obtain an LLM ToolLLaMA, and equip it\nwith a neural API retriever to recommend appropriate APIs for each instruction.\nExperiments show that ToolLLaMA demonstrates a remarkable ability to execute\ncomplex instructions and generalize to unseen APIs, and exhibits comparable\nperformance to ChatGPT. Our ToolLLaMA also demonstrates strong zero-shot\ngeneralization ability in an out-of-distribution tool-use dataset: APIBench.\n","authors":["Yujia Qin","Shihao Liang","Yining Ye","Kunlun Zhu","Lan Yan","Yaxi Lu","Yankai Lin","Xin Cong","Xiangru Tang","Bill Qian","Sihan Zhao","Lauren Hong","Runchu Tian","Ruobing Xie","Jie Zhou","Mark Gerstein","Dahai Li","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2307.16789v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02071v1","updated":"2023-10-03T14:13:36Z","published":"2023-10-03T14:13:36Z","title":"Towards End-to-End Embodied Decision Making via Multi-modal Large\n Language Model: Explorations with GPT4-Vision and Beyond","summary":" In this study, we explore the potential of Multimodal Large Language Models\n(MLLMs) in improving embodied decision-making processes for agents. While Large\nLanguage Models (LLMs) have been widely used due to their advanced reasoning\nskills and vast world knowledge, MLLMs like GPT4-Vision offer enhanced visual\nunderstanding and reasoning capabilities. We investigate whether\nstate-of-the-art MLLMs can handle embodied decision-making in an end-to-end\nmanner and whether collaborations between LLMs and MLLMs can enhance\ndecision-making. To address these questions, we introduce a new benchmark\ncalled PCA-EVAL, which evaluates embodied decision-making from the perspectives\nof Perception, Cognition, and Action. Additionally, we propose HOLMES, a\nmulti-agent cooperation framework that allows LLMs to leverage MLLMs and APIs\nto gather multimodal information for informed decision-making. We compare\nend-to-end embodied decision-making and HOLMES on our benchmark and find that\nthe GPT4-Vision model demonstrates strong end-to-end embodied decision-making\nabilities, outperforming GPT4-HOLMES in terms of average decision accuracy\n(+3%). However, this performance is exclusive to the latest GPT4-Vision model,\nsurpassing the open-source state-of-the-art MLLM by 26%. 
Our results indicate\nthat powerful MLLMs like GPT4-Vision hold promise for decision-making in\nembodied agents, offering new avenues for MLLM research.\n","authors":["Liang Chen","Yichi Zhang","Shuhuai Ren","Haozhe Zhao","Zefan Cai","Yuchi Wang","Tianyu Liu","Baobao Chang"],"pdf_url":"https://arxiv.org/pdf/2310.02071v1.pdf","comment":"18 pages, 10 figures"},{"id":"http://arxiv.org/abs/2310.00835v2","updated":"2023-10-03T13:54:02Z","published":"2023-10-02T00:59:07Z","title":"TRAM: Benchmarking Temporal Reasoning for Large Language Models","summary":" Reasoning about time is essential for understanding the nuances of events\ndescribed in natural language. Previous research on this topic has been limited\nin scope, characterized by a lack of standardized benchmarks that would allow\nfor consistent evaluations across different studies. In this paper, we\nintroduce TRAM, a temporal reasoning benchmark composed of ten datasets,\nencompassing various temporal aspects of events such as order, arithmetic,\nfrequency, and duration, designed to facilitate a comprehensive evaluation of\nthe temporal reasoning capabilities of large language models (LLMs). We conduct\nan extensive evaluation using popular LLMs, such as GPT-4 and Llama2, in both\nzero-shot and few-shot learning scenarios. Additionally, we employ BERT-based\nmodels to establish the baseline evaluations. Our findings indicate that these\nmodels still trail human performance in temporal reasoning tasks. It is our\naspiration that TRAM will spur further progress in enhancing the temporal\nreasoning abilities of LLMs.\n","authors":["Yuqing Wang","Yun Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.00835v2.pdf","comment":"21 pages, in submission"},{"id":"http://arxiv.org/abs/2310.02053v1","updated":"2023-10-03T13:51:01Z","published":"2023-10-03T13:51:01Z","title":"Controlling Topic-Focus Articulation in Meaning-to-Text Generation using\n Graph Neural Networks","summary":" A bare meaning representation can be expressed in various ways using natural\nlanguage, depending on how the information is structured on the surface level.\nWe are interested in finding ways to control topic-focus articulation when\ngenerating text from meaning. We focus on distinguishing active and passive\nvoice for sentences with transitive verbs. The idea is to add pragmatic\ninformation such as topic to the meaning representation, thereby forcing either\nactive or passive voice when given to a natural language generation system. We\nuse graph neural models because there is no explicit information about word\norder in a meaning represented by a graph. We try three different methods for\ntopic-focus articulation (TFA) employing graph neural models for a\nmeaning-to-text generation task. We propose a novel encoding strategy about\nnode aggregation in graph neural models, which instead of traditional encoding\nby aggregating adjacent node information, learns node representations by using\ndepth-first search. The results show our approach can get competitive\nperformance with state-of-art graph models on general text generation, and lead\nto significant improvements on the task of active-passive conversion compared\nto traditional adjacency-based aggregation strategies. 
Different types of TFA\ncan have a huge impact on the performance of the graph models.\n","authors":["Chunliu Wang","Rik van Noord","Johan Bos"],"pdf_url":"https://arxiv.org/pdf/2310.02053v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02050v1","updated":"2023-10-03T13:43:50Z","published":"2023-10-03T13:43:50Z","title":"Tuning Large language model for End-to-end Speech Translation","summary":" With the emergence of large language models (LLMs), multimodal models based\non LLMs have demonstrated significant potential. Models such as LLaSM, X-LLM,\nand SpeechGPT exhibit an impressive ability to comprehend and generate human\ninstructions. However, their performance often falters when faced with complex\ntasks like end-to-end speech translation (E2E-ST), a cross-language and\ncross-modal translation task. In comparison to single-modal models, multimodal\nmodels lag behind in these scenarios. This paper introduces LST, a Large\nmultimodal model designed to excel at the E2E-ST task. LST consists of a speech\nfrontend, an adapter, and a LLM backend. The training of LST consists of two\nstages: (1) Modality adjustment, where the adapter is tuned to align speech\nrepresentation with text embedding space, and (2) Downstream task fine-tuning,\nwhere both the adapter and LLM model are trained to optimize performance on the\nE2EST task. Experimental results on the MuST-C speech translation benchmark\ndemonstrate that LST-13B achieves BLEU scores of 30.39/41.55/35.33 on\nEn-De/En-Fr/En-Es language pairs, surpassing previous models and establishing a\nnew state-of-the-art. Additionally, we conduct an in-depth analysis of\nsingle-modal model selection and the impact of training strategies, which lays\nthe foundation for future research. We will open up our code and models after\nreview.\n","authors":["Hao Zhang","Nianwen Si","Yaqi Chen","Wenlin Zhang","Xukui Yang","Dan Qu","Xiaolin Jiao"],"pdf_url":"https://arxiv.org/pdf/2310.02050v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02040v1","updated":"2023-10-03T13:31:28Z","published":"2023-10-03T13:31:28Z","title":"Jury: A Comprehensive Evaluation Toolkit","summary":" Evaluation plays a critical role in deep learning as a fundamental block of\nany prediction-based system. However, the vast number of Natural Language\nProcessing (NLP) tasks and the development of various metrics have led to\nchallenges in evaluating different systems with different metrics. To address\nthese challenges, we introduce jury, a toolkit that provides a unified\nevaluation framework with standardized structures for performing evaluation\nacross different tasks and metrics. The objective of jury is to standardize and\nimprove metric evaluation for all systems and aid the community in overcoming\nthe challenges in evaluation. Since its open-source release, jury has reached a\nwide audience and is available at https://github.com/obss/jury.\n","authors":["Devrim Cavusoglu","Ulas Sert","Secil Sen","Sinan Altinuc"],"pdf_url":"https://arxiv.org/pdf/2310.02040v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01206v2","updated":"2023-10-03T13:19:40Z","published":"2023-10-02T13:48:16Z","title":"appjsonify: An Academic Paper PDF-to-JSON Conversion Toolkit","summary":" We present appjsonify, a Python-based PDF-to-JSON conversion toolkit for\nacademic papers. It parses a PDF file using several visual-based document\nlayout analysis models and rule-based text processing approaches. 
appjsonify is\na flexible tool that allows users to easily configure the processing pipeline\nto handle a specific format of a paper they wish to process. We are publicly\nreleasing appjsonify as an easy-to-install toolkit available via PyPI and\nGitHub.\n","authors":["Atsuki Yamaguchi","Terufumi Morishita"],"pdf_url":"https://arxiv.org/pdf/2310.01206v2.pdf","comment":"Preprint. PyPI: https://pypi.org/project/appjsonify/ GitHub:\n https://pypi.org/project/appjsonify/. Fixed Figure 1 containing paper PDF\n examples"},{"id":"http://arxiv.org/abs/2310.02031v1","updated":"2023-10-03T13:17:35Z","published":"2023-10-03T13:17:35Z","title":"OceanGPT: A Large Language Model for Ocean Science Tasks","summary":" Ocean science, which delves into the oceans that are reservoirs of life and\nbiodiversity, is of great significance given that oceans cover over 70% of our\nplanet's surface. Recently, advances in Large Language Models (LLMs) have\ntransformed the paradigm in science. Despite the success in other domains,\ncurrent LLMs often fall short in catering to the needs of domain experts like\noceanographers, and the potential of LLMs for ocean science is under-explored.\nThe intrinsic reason may be the immense and intricate nature of ocean data as\nwell as the necessity for higher granularity and richness in knowledge. To\nalleviate these issues, we introduce OceanGPT, the first-ever LLM in the ocean\ndomain, which is expert in various ocean science tasks. We propose DoInstruct,\na novel framework to automatically obtain a large volume of ocean domain\ninstruction data, which generates instructions based on multi-agent\ncollaboration. Additionally, we construct the first oceanography benchmark,\nOceanBench, to evaluate the capabilities of LLMs in the ocean domain. Though\ncomprehensive experiments, OceanGPT not only shows a higher level of knowledge\nexpertise for oceans science tasks but also gains preliminary embodied\nintelligence capabilities in ocean technology. Codes, data and checkpoints will\nsoon be available at https://github.com/zjunlp/KnowLM.\n","authors":["Zhen Bi","Ningyu Zhang","Yida Xue","Yixin Ou","Guozhou Zheng","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2310.02031v1.pdf","comment":"Work in progress. Project Website:\n https://zjunlp.github.io/project/OceanGPT/"},{"id":"http://arxiv.org/abs/2305.13269v2","updated":"2023-10-03T12:30:20Z","published":"2023-05-22T17:34:23Z","title":"Chain-of-Knowledge: Grounding Large Language Models via Dynamic\n Knowledge Adapting over Heterogeneous Sources","summary":" We present chain-of-knowledge (CoK), a novel framework that augments large\nlanguage models (LLMs) by dynamically incorporating grounding information from\nheterogeneous sources. It results in more factual rationales and reduced\nhallucination in generation. Specifically, CoK consists of three stages:\nreasoning preparation, dynamic knowledge adapting, and answer consolidation.\nGiven a knowledge-intensive question, CoK first prepares several preliminary\nrationales and answers while identifying the relevant knowledge domains. If\nthere is no majority consensus among the answers from samples, CoK corrects the\nrationales step by step by adapting knowledge from the identified domains.\nThese corrected rationales can plausibly serve as a better foundation for the\nfinal answer consolidation. Unlike prior studies that primarily use\nunstructured data, CoK also leverages structured knowledge sources such as\nWikidata and tables that provide more reliable factual information. 
To access\nboth unstructured and structured knowledge sources in the dynamic knowledge\nadapting stage, we propose an adaptive query generator that allows the\ngeneration of queries for various types of query languages, including SPARQL,\nSQL, and natural sentences. Moreover, to minimize error propagation between\nrationales, CoK corrects the rationales progressively using preceding corrected\nrationales to generate and correct subsequent rationales. Extensive experiments\nshow that CoK consistently improves the performance of LLMs on\nknowledge-intensive tasks across different domains.\n","authors":["Xingxuan Li","Ruochen Zhao","Yew Ken Chia","Bosheng Ding","Shafiq Joty","Soujanya Poria","Lidong Bing"],"pdf_url":"https://arxiv.org/pdf/2305.13269v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01991v1","updated":"2023-10-03T12:03:06Z","published":"2023-10-03T12:03:06Z","title":"Fill in the Blank: Exploring and Enhancing LLM Capabilities for Backward\n Reasoning in Math Word Problems","summary":" While forward reasoning (i.e. find the answer given the question) has been\nexplored extensively in the recent literature, backward reasoning is relatively\nunexplored. We examine the backward reasoning capabilities of LLMs on Math Word\nProblems (MWPs): given a mathematical question and its answer, with some\ndetails omitted from the question, can LLMs effectively retrieve the missing\ninformation?\n In this paper, we formally define the backward reasoning task on math word\nproblems and modify three datasets to evaluate this task: GSM8k, SVAMP and\nMultiArith. Our findings show a significant drop in the accuracy of models on\nbackward reasoning compared to forward reasoning across four SOTA LLMs (GPT4,\nGPT3.5, PaLM-2, and LLaMa-2). Utilizing the specific format of this task, we\npropose three novel techniques that improve performance: Rephrase reformulates\nthe given problem into a forward reasoning problem, PAL-Tools combines the idea\nof Program-Aided LLMs to produce a set of equations that can be solved by an\nexternal solver, and Check your Work exploits the availability of natural\nverifier of high accuracy in the forward direction, interleaving solving and\nverification steps. Finally, realizing that each of our base methods correctly\nsolves a different set of problems, we propose a novel Bayesian formulation for\ncreating an ensemble over these base methods aided by a verifier to further\nboost the accuracy by a significant margin. Extensive experimentation\ndemonstrates that our techniques successively improve the performance of LLMs\non the backward reasoning task, with the final ensemble-based method resulting\nin a substantial performance gain compared to the raw LLMs with standard\nprompting techniques such as chain-of-thought.\n","authors":["Aniruddha Deb","Neeva Oza","Sarthak Singla","Dinesh Khandelwal","Dinesh Garg","Parag Singla"],"pdf_url":"https://arxiv.org/pdf/2310.01991v1.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2310.01960v1","updated":"2023-10-03T11:11:55Z","published":"2023-10-03T11:11:55Z","title":"Language Models as Knowledge Bases for Visual Word Sense Disambiguation","summary":" Visual Word Sense Disambiguation (VWSD) is a novel challenging task that lies\nbetween linguistic sense disambiguation and fine-grained multimodal retrieval.\nThe recent advancements in the development of visiolinguistic (VL) transformers\nsuggest some off-the-self implementations with encouraging results, which\nhowever we argue that can be further improved. 
To this end, we propose some\nknowledge-enhancement techniques towards improving the retrieval performance of\nVL transformers via the usage of Large Language Models (LLMs) as Knowledge\nBases. More specifically, knowledge stored in LLMs is retrieved with the help\nof appropriate prompts in a zero-shot manner, achieving performance\nadvancements. Moreover, we convert VWSD to a purely textual question-answering\n(QA) problem by considering generated image captions as multiple-choice\ncandidate answers. Zero-shot and few-shot prompting strategies are leveraged to\nexplore the potential of such a transformation, while Chain-of-Thought (CoT)\nprompting in the zero-shot setting is able to reveal the internal reasoning\nsteps an LLM follows to select the appropriate candidate. In total, our\npresented approach is the first one to analyze the merits of exploiting\nknowledge stored in LLMs in different ways to solve WVSD.\n","authors":["Anastasia Kritharoula","Maria Lymperaiou","Giorgos Stamou"],"pdf_url":"https://arxiv.org/pdf/2310.01960v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01957v1","updated":"2023-10-03T11:05:14Z","published":"2023-10-03T11:05:14Z","title":"Driving with LLMs: Fusing Object-Level Vector Modality for Explainable\n Autonomous Driving","summary":" Large Language Models (LLMs) have shown promise in the autonomous driving\nsector, particularly in generalization and interpretability. We introduce a\nunique object-level multimodal LLM architecture that merges vectorized numeric\nmodalities with a pre-trained LLM to improve context understanding in driving\nsituations. We also present a new dataset of 160k QA pairs derived from 10k\ndriving scenarios, paired with high quality control commands collected with RL\nagent and question answer pairs generated by teacher LLM (GPT-3.5). A distinct\npretraining strategy is devised to align numeric vector modalities with static\nLLM representations using vector captioning language data. We also introduce an\nevaluation metric for Driving QA and demonstrate our LLM-driver's proficiency\nin interpreting driving scenarios, answering questions, and decision-making.\nOur findings highlight the potential of LLM-based driving action generation in\ncomparison to traditional behavioral cloning. We make our benchmark, datasets,\nand model available for further exploration.\n","authors":["Long Chen","Oleg Sinavski","Jan Hünermann","Alice Karnsund","Andrew James Willmott","Danny Birch","Daniel Maund","Jamie Shotton"],"pdf_url":"https://arxiv.org/pdf/2310.01957v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01929v1","updated":"2023-10-03T10:13:36Z","published":"2023-10-03T10:13:36Z","title":"Navigating Cultural Chasms: Exploring and Unlocking the Cultural POV of\n Text-To-Image Models","summary":" Text-To-Image (TTI) models, exemplified by DALL-E and StableDiffusion, have\nrecently gained prominence for their remarkable zero-shot capabilities in\ngenerating images guided by textual prompts. Language, as a conduit of culture,\nplays a pivotal role in these models' multilingual capabilities, which in turn\nshape their cultural agency. In this study, we explore the cultural perception\nembedded in TTI models by characterizing culture across three hierarchical\ntiers: cultural dimensions, cultural domains, and cultural concepts. 
We propose\na comprehensive suite of evaluation techniques, including intrinsic evaluations\nusing the CLIP space, extrinsic evaluations with a Visual-Question-Answer (VQA)\nmodel, and human assessments, to discern TTI cultural perceptions. To\nfacilitate our research, we introduce the CulText2I dataset, derived from four\ndiverse TTI models and spanning ten languages. Our experiments reveal insights\ninto these models' cultural awareness, cultural distinctions, and the unlocking\nof cultural features, releasing the potential for cross-cultural applications.\n","authors":["Mor Ventura","Eyal Ben-David","Anna Korhonen","Roi Reichart"],"pdf_url":"https://arxiv.org/pdf/2310.01929v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01917v1","updated":"2023-10-03T09:46:02Z","published":"2023-10-03T09:46:02Z","title":"Hierarchical Evaluation Framework: Best Practices for Human Evaluation","summary":" Human evaluation plays a crucial role in Natural Language Processing (NLP) as\nit assesses the quality and relevance of developed systems, thereby\nfacilitating their enhancement. However, the absence of widely accepted human\nevaluation metrics in NLP hampers fair comparisons among different systems and\nthe establishment of universal assessment standards. Through an extensive\nanalysis of existing literature on human evaluation metrics, we identified\nseveral gaps in NLP evaluation methodologies. These gaps served as motivation\nfor developing our own hierarchical evaluation framework. The proposed\nframework offers notable advantages, particularly in providing a more\ncomprehensive representation of the NLP system's performance. We applied this\nframework to evaluate the developed Machine Reading Comprehension system, which\nwas utilized within a human-AI symbiosis model. The results highlighted the\nassociations between the quality of inputs and outputs, underscoring the\nnecessity to evaluate both components rather than solely focusing on outputs.\nIn future work, we will investigate the potential time-saving benefits of our\nproposed framework for evaluators assessing NLP systems.\n","authors":["Iva Bojic","Jessica Chen","Si Yuan Chang","Qi Chwen Ong","Shafiq Joty","Josip Car"],"pdf_url":"https://arxiv.org/pdf/2310.01917v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01889v1","updated":"2023-10-03T08:44:50Z","published":"2023-10-03T08:44:50Z","title":"Ring Attention with Blockwise Transformers for Near-Infinite Context","summary":" Transformers have emerged as the architecture of choice for many\nstate-of-the-art AI models, showcasing exceptional performance across a wide\nrange of AI applications. However, the memory demands imposed by Transformers\nlimit their ability to handle long sequences, thereby creating challenges for\ntasks involving extended sequences or long-term dependencies. We present a\ndistinct approach, Ring Attention, which leverages blockwise computation of\nself-attention to distribute long sequences across multiple devices while\nconcurrently overlapping the communication of key-value blocks with the\ncomputation of blockwise attention. By processing longer input sequences while\nmaintaining memory efficiency, Ring Attention enables training and inference of\nsequences that are device count times longer than those of prior\nmemory-efficient Transformers, effectively eliminating the memory constraints\nimposed by individual devices. 
Extensive experiments on language modeling tasks\ndemonstrate the effectiveness of Ring Attention in allowing large sequence\ninput size and improving performance.\n","authors":["Hao Liu","Matei Zaharia","Pieter Abbeel"],"pdf_url":"https://arxiv.org/pdf/2310.01889v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01886v1","updated":"2023-10-03T08:39:33Z","published":"2023-10-03T08:39:33Z","title":"Effective and Parameter-Efficient Reusing Fine-Tuned Models","summary":" Many pre-trained large-scale models provided online have become highly\neffective in transferring to downstream tasks. At the same time, various\ntask-specific models fine-tuned on these pre-trained models are available\nonline for public use. In practice, as collecting task-specific data is\nlabor-intensive and fine-tuning the large pre-trained models is computationally\nexpensive, one can reuse task-specific finetuned models to deal with downstream\ntasks. However, using a model per task causes a heavy burden on storage and\nserving. Recently, many training-free and parameter-efficient methods have been\nproposed for reusing multiple fine-tuned task-specific models into a single\nmulti-task model. However, these methods exhibit a large accuracy gap compared\nwith using a fine-tuned model per task. In this paper, we propose\nParameter-Efficient methods for ReUsing (PERU) fine-tuned models. For reusing\nFully Fine-Tuned (FFT) models, we propose PERU-FFT by injecting a sparse task\nvector into a merged model by magnitude pruning. For reusing LoRA fine-tuned\nmodels, we propose PERU-LoRA use a lower-rank matrix to approximate the LoRA\nmatrix by singular value decomposition. Both PERUFFT and PERU-LoRA are\ntraining-free. Extensive experiments conducted on computer vision and natural\nlanguage process tasks demonstrate the effectiveness and parameter-efficiency\nof the proposed methods. The proposed PERU-FFT and PERU-LoRA outperform\nexisting reusing model methods by a large margin and achieve comparable\nperformance to using a fine-tuned model per task.\n","authors":["Weisen Jiang","Baijiong Lin","Han Shi","Yu Zhang","and Zhenguo Li","James T. Kwok"],"pdf_url":"https://arxiv.org/pdf/2310.01886v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2310.01405v2","updated":"2023-10-03T08:39:09Z","published":"2023-10-02T17:59:07Z","title":"Representation Engineering: A Top-Down Approach to AI Transparency","summary":" In this paper, we identify and characterize the emerging area of\nrepresentation engineering (RepE), an approach to enhancing the transparency of\nAI systems that draws on insights from cognitive neuroscience. RepE places\npopulation-level representations, rather than neurons or circuits, at the\ncenter of analysis, equipping us with novel methods for monitoring and\nmanipulating high-level cognitive phenomena in deep neural networks (DNNs). We\nprovide baselines and an initial analysis of RepE techniques, showing that they\noffer simple yet effective solutions for improving our understanding and\ncontrol of large language models. We showcase how these methods can provide\ntraction on a wide range of safety-relevant problems, including honesty,\nharmlessness, power-seeking, and more, demonstrating the promise of top-down\ntransparency research. 
We hope that this work catalyzes further exploration of\nRepE and fosters advancements in the transparency and safety of AI systems.\n","authors":["Andy Zou","Long Phan","Sarah Chen","James Campbell","Phillip Guo","Richard Ren","Alexander Pan","Xuwang Yin","Mantas Mazeika","Ann-Kathrin Dombrowski","Shashwat Goel","Nathaniel Li","Michael J. Byun","Zifan Wang","Alex Mallen","Steven Basart","Sanmi Koyejo","Dawn Song","Matt Fredrikson","J. Zico Kolter","Dan Hendrycks"],"pdf_url":"https://arxiv.org/pdf/2310.01405v2.pdf","comment":"Code is available at\n https://github.com/andyzoujm/representation-engineering"},{"id":"http://arxiv.org/abs/2307.12375v3","updated":"2023-10-03T08:19:44Z","published":"2023-07-23T16:54:41Z","title":"In-Context Learning Learns Label Relationships but Is Not Conventional\n Learning","summary":" The predictions of Large Language Models (LLMs) on downstream tasks often\nimprove significantly when including examples of the input--label relationship\nin the context. However, there is currently no consensus about how this\nin-context learning (ICL) ability of LLMs works. For example, while Xie et al.\n(2021) liken ICL to a general-purpose learning algorithm, Min et al. (2022)\nargue ICL does not even learn label relationships from in-context examples. In\nthis paper, we provide novel insights into how ICL leverages label information,\nrevealing both capabilities and limitations. To ensure we obtain a\ncomprehensive picture of ICL behavior, we study probabilistic aspects of ICL\npredictions and thoroughly examine the dynamics of ICL as more examples are\nprovided. Our experiments show that ICL predictions almost always depend on\nin-context labels, and that ICL can learn truly novel tasks in-context.\nHowever, we also find that ICL struggles to fully overcome prediction\npreferences acquired from pre-training data, and, further, that ICL does not\nconsider all in-context information equally.\n","authors":["Jannik Kossen","Yarin Gal","Tom Rainforth"],"pdf_url":"https://arxiv.org/pdf/2307.12375v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00602v2","updated":"2023-10-03T08:00:53Z","published":"2023-10-01T07:31:00Z","title":"Wavelet Scattering Transform for Improving Generalization in\n Low-Resourced Spoken Language Identification","summary":" Commonly used features in spoken language identification (LID), such as\nmel-spectrogram or MFCC, lose high-frequency information due to windowing. The\nloss further increases for longer temporal contexts. To improve generalization\nof the low-resourced LID systems, we investigate an alternate feature\nrepresentation, wavelet scattering transform (WST), that compensates for the\nshortcomings. To our knowledge, WST is not explored earlier in LID tasks. We\nfirst optimize WST features for multiple South Asian LID corpora. We show that\nLID requires low octave resolution and frequency-scattering is not useful.\nFurther, cross-corpora evaluations show that the optimal WST hyper-parameters\ndepend on both train and test corpora. Hence, we develop fused ECAPA-TDNN based\nLID systems with different sets of WST hyper-parameters to improve\ngeneralization for unknown data. 
Compared to MFCC, EER is reduced upto 14.05%\nand 6.40% for same-corpora and blind VoxLingua107 evaluations, respectively.\n","authors":["Spandan Dey","Premjeet Singh","Goutam Saha"],"pdf_url":"https://arxiv.org/pdf/2310.00602v2.pdf","comment":"Accepted and presented in INTERSPEECH 2023"},{"id":"http://arxiv.org/abs/2306.01102v5","updated":"2023-10-03T07:43:30Z","published":"2023-06-01T19:33:21Z","title":"LLMatic: Neural Architecture Search via Large Language Models and\n Quality Diversity Optimization","summary":" Large Language Models (LLMs) have emerged as powerful tools capable of\naccomplishing a broad spectrum of tasks. Their abilities span numerous areas,\nand one area where they have made a significant impact is in the domain of code\ngeneration. In this context, we view LLMs as mutation and crossover tools.\nMeanwhile, Quality-Diversity (QD) algorithms are known to discover diverse and\nrobust solutions. By merging the code-generating abilities of LLMs with the\ndiversity and robustness of QD solutions, we introduce LLMatic, a Neural\nArchitecture Search (NAS) algorithm. While LLMs struggle to conduct NAS\ndirectly through prompts, LLMatic uses a procedural approach, leveraging QD for\nprompts and network architecture to create diverse and highly performant\nnetworks. We test LLMatic on the CIFAR-10 image classification benchmark,\ndemonstrating that it can produce competitive networks with just $2,000$\nsearches, even without prior knowledge of the benchmark domain or exposure to\nany previous top-performing models for the benchmark.\n","authors":["Muhammad U. Nasir","Sam Earle","Julian Togelius","Steven James","Christopher Cleghorn"],"pdf_url":"https://arxiv.org/pdf/2306.01102v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01854v1","updated":"2023-10-03T07:34:30Z","published":"2023-10-03T07:34:30Z","title":"Fine-tuned vs. Prompt-tuned Supervised Representations: Which Better\n Account for Brain Language Representations?","summary":" To decipher the algorithm underlying the human brain's language\nrepresentation, previous work probed brain responses to language input with\npre-trained artificial neural network (ANN) models fine-tuned on NLU tasks.\nHowever, full fine-tuning generally updates the entire parametric space and\ndistorts pre-trained features, cognitively inconsistent with the brain's robust\nmulti-task learning ability. Prompt-tuning, in contrast, protects pre-trained\nweights and learns task-specific embeddings to fit a task. Could prompt-tuning\ngenerate representations that better account for the brain's language\nrepresentations than fine-tuning? If so, what kind of NLU task leads a\npre-trained model to better decode the information represented in the human\nbrain? We investigate these questions by comparing prompt-tuned and fine-tuned\nrepresentations in neural decoding, that is predicting the linguistic stimulus\nfrom the brain activities evoked by the stimulus. We find that on none of the\n10 NLU tasks, full fine-tuning significantly outperforms prompt-tuning in\nneural decoding, implicating that a more brain-consistent tuning method yields\nrepresentations that better correlate with brain data. Moreover, we identify\nthat tasks dealing with fine-grained concept meaning yield representations that\nbetter decode brain activation patterns than other tasks, especially the\nsyntactic chunking task. 
This indicates that our brain encodes more\nfine-grained concept information than shallow syntactic information when\nrepresenting languages.\n","authors":["Jingyuan Sun","Marie-Francine Moens"],"pdf_url":"https://arxiv.org/pdf/2310.01854v1.pdf","comment":"IJCAI 2023"},{"id":"http://arxiv.org/abs/2310.01846v1","updated":"2023-10-03T07:23:22Z","published":"2023-10-03T07:23:22Z","title":"Benchmarking and Improving Generator-Validator Consistency of Language\n Models","summary":" As of September 2023, ChatGPT correctly answers \"what is 7+8\" with 15, but\nwhen asked \"7+8=15, True or False\" it responds with \"False\". This inconsistency\nbetween generating and validating an answer is prevalent in language models\n(LMs) and erodes trust. In this paper, we propose a framework for measuring the\nconsistency between generation and validation (which we call\ngenerator-validator consistency, or GV-consistency), finding that even GPT-4, a\nstate-of-the-art LM, is GV-consistent only 76% of the time. To improve the\nconsistency of LMs, we propose to finetune on the filtered generator and\nvalidator responses that are GV-consistent, and call this approach consistency\nfine-tuning. We find that this approach improves GV-consistency of Alpaca-30B\nfrom 60% to 93%, and the improvement extrapolates to unseen tasks and domains\n(e.g., GV-consistency for positive style transfers extrapolates to unseen\nstyles like humor). In addition to improving consistency, consistency\nfine-tuning improves both generator quality and validator accuracy without\nusing any labeled data. Evaluated across 6 tasks, including math questions,\nknowledge-intensive QA, and instruction following, our method improves the\ngenerator quality by 16% and the validator accuracy by 6.3% across all tasks.\n","authors":["Xiang Lisa Li","Vaishnavi Shrivastava","Siyan Li","Tatsunori Hashimoto","Percy Liang"],"pdf_url":"https://arxiv.org/pdf/2310.01846v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2310.01845v1","updated":"2023-10-03T07:19:59Z","published":"2023-10-03T07:19:59Z","title":"Zero-Shot Refinement of Buildings' Segmentation Models using SAM","summary":" Foundation models have excelled in various tasks but are often evaluated on\ngeneral benchmarks. The adaptation of these models for specific domains, such\nas remote sensing imagery, remains an underexplored area. In remote sensing,\nprecise building instance segmentation is vital for applications like urban\nplanning. While Convolutional Neural Networks (CNNs) perform well, their\ngeneralization can be limited. For this aim, we present a novel approach to\nadapt foundation models to address existing models' generalization dropback.\nAmong several models, our focus centers on the Segment Anything Model (SAM), a\npotent foundation model renowned for its prowess in class-agnostic image\nsegmentation capabilities. We start by identifying the limitations of SAM,\nrevealing its suboptimal performance when applied to remote sensing imagery.\nMoreover, SAM does not offer recognition abilities and thus fails to classify\nand tag localized objects. To address these limitations, we introduce different\nprompting strategies, including integrating a pre-trained CNN as a prompt\ngenerator. This novel approach augments SAM with recognition abilities, a first\nof its kind. We evaluated our method on three remote sensing datasets,\nincluding the WHU Buildings dataset, the Massachusetts Buildings dataset, and\nthe AICrowd Mapping Challenge. 
For out-of-distribution performance on the WHU\ndataset, we achieve a 5.47% increase in IoU and a 4.81% improvement in\nF1-score. For in-distribution performance on the WHU dataset, we observe a\n2.72% and 1.58% increase in True-Positive-IoU and True-Positive-F1 score,\nrespectively. We intend to release our code repository, hoping to inspire\nfurther exploration of foundation models for domain-specific tasks within the\nremote sensing community.\n","authors":["Ali Mayladan","Hasan Nasrallah","Hasan Moughnieh","Mustafa Shukor","Ali J. Ghandour"],"pdf_url":"https://arxiv.org/pdf/2310.01845v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01839v1","updated":"2023-10-03T07:05:37Z","published":"2023-10-03T07:05:37Z","title":"Preserving Phonemic Distinctions for Ordinal Regression: A Novel Loss\n Function for Automatic Pronunciation Assessment","summary":" Automatic pronunciation assessment (APA) manages to quantify the\npronunciation proficiency of a second language (L2) learner in a language.\nPrevailing approaches to APA normally leverage neural models trained with a\nregression loss function, such as the mean-squared error (MSE) loss, for\nproficiency level prediction. Despite most regression models can effectively\ncapture the ordinality of proficiency levels in the feature space, they are\nconfronted with a primary obstacle that different phoneme categories with the\nsame proficiency level are inevitably forced to be close to each other,\nretaining less phoneme-discriminative information. On account of this, we\ndevise a phonemic contrast ordinal (PCO) loss for training regression-based APA\nmodels, which aims to preserve better phonemic distinctions between phoneme\ncategories meanwhile considering ordinal relationships of the regression target\noutput. Specifically, we introduce a phoneme-distinct regularizer into the MSE\nloss, which encourages feature representations of different phoneme categories\nto be far apart while simultaneously pulling closer the representations\nbelonging to the same phoneme category by means of weighted distances. An\nextensive set of experiments carried out on the speechocean762 benchmark\ndataset suggest the feasibility and effectiveness of our model in relation to\nsome existing state-of-the-art models.\n","authors":["Bi-Cheng Yan","Hsin-Wei Wang","Yi-Cheng Wang","Jiun-Ting Li","Chi-Han Lin","Berlin Chen"],"pdf_url":"https://arxiv.org/pdf/2310.01839v1.pdf","comment":"Submitted to ASRU 2023"},{"id":"http://arxiv.org/abs/2310.01837v1","updated":"2023-10-03T07:01:23Z","published":"2023-10-03T07:01:23Z","title":"Extending CAM-based XAI methods for Remote Sensing Imagery Segmentation","summary":" Current AI-based methods do not provide comprehensible physical\ninterpretations of the utilized data, extracted features, and\npredictions/inference operations. As a result, deep learning models trained\nusing high-resolution satellite imagery lack transparency and explainability\nand can be merely seen as a black box, which limits their wide-level adoption.\nExperts need help understanding the complex behavior of AI models and the\nunderlying decision-making process. The explainable artificial intelligence\n(XAI) field is an emerging field providing means for robust, practical, and\ntrustworthy deployment of AI models. Several XAI techniques have been proposed\nfor image classification tasks, whereas the interpretation of image\nsegmentation remains largely unexplored. 
This paper offers to bridge this gap\nby adapting the recent XAI classification algorithms and making them usable for\nmuti-class image segmentation, where we mainly focus on buildings' segmentation\nfrom high-resolution satellite images. To benchmark and compare the performance\nof the proposed approaches, we introduce a new XAI evaluation methodology and\nmetric based on \"Entropy\" to measure the model uncertainty. Conventional XAI\nevaluation methods rely mainly on feeding area-of-interest regions from the\nimage back to the pre-trained (utility) model and then calculating the average\nchange in the probability of the target class. Those evaluation metrics lack\nthe needed robustness, and we show that using Entropy to monitor the model\nuncertainty in segmenting the pixels within the target class is more suitable.\nWe hope this work will pave the way for additional XAI research for image\nsegmentation and applications in the remote sensing discipline.\n","authors":["Abdul Karim Gizzini","Mustafa Shukor","Ali J. Ghandour"],"pdf_url":"https://arxiv.org/pdf/2310.01837v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01828v1","updated":"2023-10-03T06:51:48Z","published":"2023-10-03T06:51:48Z","title":"Trainable Noise Model as an XAI evaluation method: application on Sobol\n for remote sensing image segmentation","summary":" eXplainable Artificial Intelligence (XAI) has emerged as an essential\nrequirement when dealing with mission-critical applications, ensuring\ntransparency and interpretability of the employed black box AI models. The\nsignificance of XAI spans various domains, from healthcare to finance, where\nunderstanding the decision-making process of deep learning algorithms is\nessential. Most AI-based computer vision models are often black boxes; hence,\nproviding explainability of deep neural networks in image processing is crucial\nfor their wide adoption and deployment in medical image analysis, autonomous\ndriving, and remote sensing applications. Recently, several XAI methods for\nimage classification tasks have been introduced. On the contrary, image\nsegmentation has received comparatively less attention in the context of\nexplainability, although it is a fundamental task in computer vision\napplications, especially in remote sensing. Only some research proposes\ngradient-based XAI algorithms for image segmentation. This paper adapts the\nrecent gradient-free Sobol XAI method for semantic segmentation. To measure the\nperformance of the Sobol method for segmentation, we propose a quantitative XAI\nevaluation method based on a learnable noise model. The main objective of this\nmodel is to induce noise on the explanation maps, where higher induced noise\nsignifies low accuracy and vice versa. A benchmark analysis is conducted to\nevaluate and compare performance of three XAI methods, including Seg-Grad-CAM,\nSeg-Grad-CAM++ and Seg-Sobol using the proposed noise-based evaluation\ntechnique. This constitutes the first attempt to run and evaluate XAI methods\nusing high-resolution satellite images.\n","authors":["Hossein Shreim","Abdul Karim Gizzini","Ali J. 
Ghandour"],"pdf_url":"https://arxiv.org/pdf/2310.01828v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01825v1","updated":"2023-10-03T06:42:28Z","published":"2023-10-03T06:42:28Z","title":"Empirical Study of PEFT techniques for Winter Wheat Segmentation","summary":" Parameter Efficient Fine Tuning (PEFT) techniques have recently experienced\nsignificant growth and have been extensively employed to adapt large vision and\nlanguage models to various domains, enabling satisfactory model performance\nwith minimal computational needs. Despite these advances, more research has yet\nto delve into potential PEFT applications in real-life scenarios, particularly\nin the critical domains of remote sensing and crop monitoring. The diversity of\nclimates across different regions and the need for comprehensive large-scale\ndatasets have posed significant obstacles to accurately identify crop types\nacross varying geographic locations and changing growing seasons. This study\nseeks to bridge this gap by comprehensively exploring the feasibility of\ncross-area and cross-year out-of-distribution generalization using the\nState-of-the-Art (SOTA) wheat crop monitoring model. The aim of this work is to\nexplore PEFT approaches for crop monitoring. Specifically, we focus on adapting\nthe SOTA TSViT model to address winter wheat field segmentation, a critical\ntask for crop monitoring and food security. This adaptation process involves\nintegrating different PEFT techniques, including BigFit, LoRA, Adaptformer, and\nprompt tuning. Using PEFT techniques, we achieved notable results comparable to\nthose achieved using full fine-tuning methods while training only a mere 0.7%\nparameters of the whole TSViT architecture. The in-house labeled data-set,\nreferred to as the Beqaa-Lebanon dataset, comprises high-quality annotated\npolygons for wheat and non-wheat classes with a total surface of 170 kmsq, over\nfive consecutive years. Using Sentinel-2 images, our model achieved a 84%\nF1-score. We intend to publicly release the Lebanese winter wheat data set,\ncode repository, and model weights.\n","authors":["Mohamad Hasan Zahweh","Hasan Nasrallah","Mustafa Shukor","Ghaleb Faour","Ali J. Ghandour"],"pdf_url":"https://arxiv.org/pdf/2310.01825v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07505v2","updated":"2023-10-03T06:09:18Z","published":"2023-08-15T00:08:43Z","title":"Data Race Detection Using Large Language Models","summary":" Large language models (LLMs) are demonstrating significant promise as an\nalternate strategy to facilitate analyses and optimizations of high-performance\ncomputing programs, circumventing the need for resource-intensive manual tool\ncreation. In this paper, we explore a novel LLM-based data race detection\napproach combining prompting engineering and fine-tuning techniques. We create\na dedicated dataset named DRB-ML, which is derived from DataRaceBench, with\nfine-grain labels showing the presence of data race pairs and their associated\nvariables, line numbers, and read/write information. DRB-ML is then used to\nevaluate representative LLMs and fine-tune open-source ones. Our experiment\nshows that LLMs can be a viable approach to data race detection. 
However, they\nstill cannot compete with traditional data race detection tools when we need\ndetailed information about variable pairs causing data races.\n","authors":["Le Chen","Xianzhong Ding","Murali Emani","Tristan Vanderbruggen","Pei-hung Lin","Chuanhua Liao"],"pdf_url":"https://arxiv.org/pdf/2308.07505v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13649v2","updated":"2023-10-03T05:30:36Z","published":"2023-06-23T17:56:26Z","title":"Generalized Knowledge Distillation for Auto-regressive Language Models","summary":" Knowledge distillation (KD) is widely used for compressing a teacher model to\nreduce its inference cost and memory footprint, by training a smaller student\nmodel. However, current KD methods for auto-regressive sequence models suffer\nfrom distribution mismatch between output sequences seen during training and\nthose generated by the student during inference. To address this issue, we\nintroduce Generalized Knowledge Distillation (GKD). Instead of solely relying\non a fixed set of output sequences, GKD trains the student on its\nself-generated output sequences by leveraging feedback from the teacher on such\nsequences. Unlike supervised KD approaches, GKD also offers the flexibility to\nemploy alternative loss functions between the student and teacher, which can be\nuseful when the student lacks the expressivity to mimic the teacher's\ndistribution. Furthermore, GKD facilitates the seamless integration of\ndistillation with RL fine-tuning (RLHF). We demonstrate the efficacy of GKD for\ndistilling auto-regressive T5 language models on summarization, translation,\nand arithmetic reasoning tasks as well as task-agnostic instruction tuning.\n","authors":["Rishabh Agarwal","Nino Vieillard","Yongchao Zhou","Piotr Stanczyk","Sabela Ramos","Matthieu Geist","Olivier Bachem"],"pdf_url":"https://arxiv.org/pdf/2306.13649v2.pdf","comment":"First two authors contributed equally. Added new results and\n experiment details"},{"id":"http://arxiv.org/abs/2210.06282v4","updated":"2023-10-03T05:17:54Z","published":"2022-10-12T15:05:28Z","title":"DialoGen: Generalized Long-Range Context Representation for Dialogue\n Systems","summary":" Long-range context modeling is crucial to both dialogue understanding and\ngeneration. The most popular method for dialogue context representation is to\nconcatenate the last-$k$ utterances in chronological order. However, this\nmethod may not be ideal for conversations containing long-range dependencies,\ni.e., when there is a need to look beyond last-$k$ utterances to generate a\nmeaningful response. In this work, we propose DialoGen, a novel encoder-decoder\nbased framework for dialogue generation with a generalized context\nrepresentation that can look beyond the last-$k$ utterances. The main idea of\nour approach is to identify and utilize the most relevant historical utterances\ninstead of last-$k$, which also enables the compact representation of dialogue\nhistory with fewer tokens. We study the effectiveness of our proposed method on\nboth dialogue generation (open-domain) and understanding (DST). Even with a\ncompact context representation, DialoGen performs comparably to the\nstate-of-the-art models on the open-domain DailyDialog dataset. We observe a\nsimilar behavior on the DST task of the MultiWOZ dataset when the proposed\ncontext representation is applied to existing DST models. 
We also discuss the\ngeneralizability and interpretability of DialoGen and show that the relevance\nscore of previous utterances agrees well with human cognition.\n","authors":["Suvodip Dey","Maunendra Sankar Desarkar","Asif Ekbal","P. K. Srijith"],"pdf_url":"https://arxiv.org/pdf/2210.06282v4.pdf","comment":"Accepted at PACLIC 2023"},{"id":"http://arxiv.org/abs/2310.01801v1","updated":"2023-10-03T05:17:08Z","published":"2023-10-03T05:17:08Z","title":"Model Tells You What to Discard: Adaptive KV Cache Compression for LLMs","summary":" In this study, we introduce adaptive KV cache compression, a plug-and-play\nmethod that reduces the memory footprint of generative inference for Large\nLanguage Models (LLMs). Different from the conventional KV cache that retains\nkey and value vectors for all context tokens, we conduct targeted profiling to\ndiscern the intrinsic structure of attention modules. Based on the recognized\nstructure, we then construct the KV cache in an adaptive manner: evicting\nlong-range contexts on attention heads emphasizing local contexts, discarding\nnon-special tokens on attention heads centered on special tokens, and only\nemploying the standard KV cache for attention heads that broadly attend to all\ntokens. Moreover, with the lightweight attention profiling used to guide the\nconstruction of the adaptive KV cache, FastGen can be deployed without\nresource-intensive fine-tuning or re-training. In our experiments across\nvarious asks, FastGen demonstrates substantial reduction on GPU memory\nconsumption with negligible generation quality loss. We will release our code\nand the compatible CUDA kernel for reproducibility.\n","authors":["Suyu Ge","Yunan Zhang","Liyuan Liu","Minjia Zhang","Jiawei Han","Jianfeng Gao"],"pdf_url":"https://arxiv.org/pdf/2310.01801v1.pdf","comment":"Under Review; To be updated"},{"id":"http://arxiv.org/abs/2309.17415v2","updated":"2023-10-03T05:16:36Z","published":"2023-09-29T17:26:03Z","title":"Intuitive or Dependent? Investigating LLMs' Robustness to Conflicting\n Prompts","summary":" This paper explores the robustness of LLMs' preference to their internal\nmemory or the given prompt, which may contain contrasting information in\nreal-world applications due to noise or task settings. To this end, we\nestablish a quantitative benchmarking framework and conduct the role playing\nintervention to control LLMs' preference. In specific, we define two types of\nrobustness, factual robustness targeting the ability to identify the correct\nfact from prompts or memory, and decision style to categorize LLMs' behavior in\nmaking consistent choices -- assuming there is no definitive \"right\" answer --\nintuitive, dependent, or rational based on cognitive theory. Our findings,\nderived from extensive experiments on seven open-source and closed-source LLMs,\nreveal that these models are highly susceptible to misleading prompts,\nespecially for instructing commonsense knowledge. While detailed instructions\ncan mitigate the selection of misleading answers, they also increase the\nincidence of invalid responses. 
After Unraveling the preference, we intervene\ndifferent sized LLMs through specific style of role instruction, showing their\nvarying upper bound of robustness and adaptivity.\n","authors":["Jiahao Ying","Yixin Cao","Kai Xiong","Yidong He","Long Cui","Yongbin Liu"],"pdf_url":"https://arxiv.org/pdf/2309.17415v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.02676v7","updated":"2023-10-03T05:04:47Z","published":"2023-02-06T10:28:16Z","title":"Chain of Hindsight Aligns Language Models with Feedback","summary":" Learning from human preferences is important for language models to match\nhuman needs and to align with human and social values. Prior works have\nachieved remarkable successes by learning from human feedback to understand and\nfollow instructions. Nonetheless, these methods are either founded on\nhand-picked model generations that are favored by human annotators, rendering\nthem inefficient in terms of data utilization and challenging to apply in\ngeneral, or they depend on reinforcement learning, which often suffers from\nimperfect reward functions and relies on extremely challenging optimizations.\nIn this work, we propose a novel technique, Chain of Hindsight, that is easy to\noptimize and can learn from any form of feedback, regardless of its polarity.\nOur idea is inspired by how humans learn from extensive feedback presented in\nthe form of languages. We convert all types of feedback into sequences of\nsentences, which are then used to fine-tune the model, allowing us to take\nadvantage of the language comprehension capabilities of language models. We\ncondition the model on a sequence of model generations paired with feedback. By\ndoing so, the model is trained to generate outputs based on feedback, while\nlearning to identify and correct negative attributes or errors. Applying our\nmethod to large language models, we observed that Chain of Hindsight\nsignificantly surpasses previous methods in aligning language models with human\npreferences. We report significant improvements on summarization and dialogue\nbenchmarks, with our approach markedly preferred in human evaluations.\n","authors":["Hao Liu","Carmelo Sferrazza","Pieter Abbeel"],"pdf_url":"https://arxiv.org/pdf/2302.02676v7.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01798v1","updated":"2023-10-03T04:56:12Z","published":"2023-10-03T04:56:12Z","title":"Large Language Models Cannot Self-Correct Reasoning Yet","summary":" Large Language Models (LLMs) have emerged as a groundbreaking technology with\ntheir unparalleled text generation capabilities across various applications.\nNevertheless, concerns persist regarding the accuracy and appropriateness of\ntheir generated content. A contemporary methodology, self-correction, has been\nproposed as a remedy to these issues. Building upon this premise, this paper\ncritically examines the role and efficacy of self-correction within LLMs,\nshedding light on its true potential and limitations. Central to our\ninvestigation is the notion of intrinsic self-correction, whereby an LLM\nattempts to correct its initial responses based solely on its inherent\ncapabilities, without the crutch of external feedback. In the context of\nreasoning, our research indicates that LLMs struggle to self-correct their\nresponses without external feedback, and at times, their performance might even\ndegrade post self-correction. 
Drawing from these insights, we offer suggestions\nfor future research and practical applications in this field.\n","authors":["Jie Huang","Xinyun Chen","Swaroop Mishra","Huaixiu Steven Zheng","Adams Wei Yu","Xinying Song","Denny Zhou"],"pdf_url":"https://arxiv.org/pdf/2310.01798v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.04195v3","updated":"2023-10-03T04:42:09Z","published":"2023-05-07T05:40:48Z","title":"Cross-Modal Retrieval for Motion and Text via DopTriple Loss","summary":" Cross-modal retrieval of image-text and video-text is a prominent research\narea in computer vision and natural language processing. However, there has\nbeen insufficient attention given to cross-modal retrieval between human motion\nand text, despite its wide-ranging applicability. To address this gap, we\nutilize a concise yet effective dual-unimodal transformer encoder for tackling\nthis task. Recognizing that overlapping atomic actions in different human\nmotion sequences can lead to semantic conflicts between samples, we explore a\nnovel triplet loss function called DropTriple Loss. This loss function discards\nfalse negative samples from the negative sample set and focuses on mining\nremaining genuinely hard negative samples for triplet training, thereby\nreducing violations they cause. We evaluate our model and approach on the\nHumanML3D and KIT Motion-Language datasets. On the latest HumanML3D dataset, we\nachieve a recall of 62.9% for motion retrieval and 71.5% for text retrieval\n(both based on R@10). The source code for our approach is publicly available at\nhttps://github.com/eanson023/rehamot.\n","authors":["Sheng Yan","Yang Liu","Haoqiang Wang","Xin Du","Mengyuan Liu","Hong Liu"],"pdf_url":"https://arxiv.org/pdf/2305.04195v3.pdf","comment":"This paper is accepted by ACM MM Asia 2023"},{"id":"http://arxiv.org/abs/2310.00535v2","updated":"2023-10-03T04:23:26Z","published":"2023-10-01T01:21:35Z","title":"JoMA: Demystifying Multilayer Transformers via JOint Dynamics of MLP and\n Attention","summary":" We propose Joint MLP/Attention (JoMA) dynamics, a novel mathematical\nframework to understand the training procedure of multilayer Transformer\narchitectures. This is achieved by integrating out the self-attention layer in\nTransformers, producing a modified dynamics of MLP layers only. JoMA removes\nunrealistic assumptions in previous analysis (e.g., lack of residual\nconnection) and predicts that the attention first becomes sparse (to learn\nsalient tokens), then dense (to learn less salient tokens) in the presence of\nnonlinear activations, while in the linear case, it is consistent with existing\nworks that show attention becomes sparse over time. We leverage JoMA to\nqualitatively explains how tokens are combined to form hierarchies in\nmultilayer Transformers, when the input tokens are generated by a latent\nhierarchical generative model. Experiments on models trained from real-world\ndataset (Wikitext2/Wikitext103) and various pre-trained models (OPT, Pythia)\nverify our theoretical findings.\n","authors":["Yuandong Tian","Yiping Wang","Zhenyu Zhang","Beidi Chen","Simon Du"],"pdf_url":"https://arxiv.org/pdf/2310.00535v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01783v1","updated":"2023-10-03T04:14:17Z","published":"2023-10-03T04:14:17Z","title":"Can large language models provide useful feedback on research papers? A\n large-scale empirical analysis","summary":" Expert feedback lays the foundation of rigorous research. 
However, the rapid\ngrowth of scholarly production and intricate knowledge specialization challenge\nthe conventional scientific feedback mechanisms. High-quality peer reviews are\nincreasingly difficult to obtain. Researchers who are more junior or from\nunder-resourced settings have especially hard times getting timely feedback.\nWith the breakthrough of large language models (LLM) such as GPT-4, there is\ngrowing interest in using LLMs to generate scientific feedback on research\nmanuscripts. However, the utility of LLM-generated feedback has not been\nsystematically studied. To address this gap, we created an automated pipeline\nusing GPT-4 to provide comments on the full PDFs of scientific papers. We\nevaluated the quality of GPT-4's feedback through two large-scale studies. We\nfirst quantitatively compared GPT-4's generated feedback with human peer\nreviewer feedback in 15 Nature family journals (3,096 papers in total) and the\nICLR machine learning conference (1,709 papers). The overlap in the points\nraised by GPT-4 and by human reviewers (average overlap 30.85% for Nature\njournals, 39.23% for ICLR) is comparable to the overlap between two human\nreviewers (average overlap 28.58% for Nature journals, 35.25% for ICLR). The\noverlap between GPT-4 and human reviewers is larger for the weaker papers. We\nthen conducted a prospective user study with 308 researchers from 110 US\ninstitutions in the field of AI and computational biology to understand how\nresearchers perceive feedback generated by our GPT-4 system on their own\npapers. Overall, more than half (57.4%) of the users found GPT-4 generated\nfeedback helpful/very helpful and 82.4% found it more beneficial than feedback\nfrom at least some human reviewers. While our findings show that LLM-generated\nfeedback can help researchers, we also identify several limitations.\n","authors":["Weixin Liang","Yuhui Zhang","Hancheng Cao","Binglu Wang","Daisy Ding","Xinyu Yang","Kailas Vodrahalli","Siyu He","Daniel Smith","Yian Yin","Daniel McFarland","James Zou"],"pdf_url":"https://arxiv.org/pdf/2310.01783v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01777v1","updated":"2023-10-03T03:56:26Z","published":"2023-10-03T03:56:26Z","title":"SEA: Sparse Linear Attention with Estimated Attention Mask","summary":" The transformer architecture has made breakthroughs in recent years on tasks\nwhich require modeling pairwise relationships between sequential elements, as\nis the case in natural language understanding. However, transformers struggle\nwith long sequences due to the quadratic complexity of the attention operation,\nand previous research has aimed to lower the complexity by sparsifying or\nlinearly approximating the attention matrix. Yet, these approaches cannot\nstraightforwardly distill knowledge from a teacher's attention matrix, and\noften require complete retraining from scratch. Furthermore, previous sparse\nand linear approaches may also lose interpretability if they do not produce\nfull quadratic attention matrices. To address these challenges, we propose SEA:\nSparse linear attention with an Estimated Attention mask. SEA estimates the\nattention matrix with linear complexity via kernel-based linear attention, then\ncreates a sparse approximation to the full attention matrix with a top-k\nselection to perform a sparse attention operation. 
For language modeling tasks\n(Wikitext2), previous linear and sparse attention methods show a roughly\ntwo-fold worse perplexity scores over the quadratic OPT-125M baseline, while\nSEA achieves an even better perplexity than OPT-125M, using roughly half as\nmuch memory as OPT-125M. Moreover, SEA maintains an interpretable attention\nmatrix and can utilize knowledge distillation to lower the complexity of\nexisting pretrained transformers. We believe that our work will have a large\npractical impact, as it opens the possibility of running large transformers on\nresource-limited devices with less memory.\n","authors":["Heejun Lee","Jina Kim","Jeffrey Willette","Sung Ju Hwang"],"pdf_url":"https://arxiv.org/pdf/2310.01777v1.pdf","comment":"9 main pages"},{"id":"http://arxiv.org/abs/2307.12856v3","updated":"2023-10-03T03:51:14Z","published":"2023-07-24T14:56:30Z","title":"A Real-World WebAgent with Planning, Long Context Understanding, and\n Program Synthesis","summary":" Pre-trained large language models (LLMs) have recently achieved better\ngeneralization and sample efficiency in autonomous web automation. However, the\nperformance on real-world websites has still suffered from (1) open domainness,\n(2) limited context length, and (3) lack of inductive bias on HTML. We\nintroduce WebAgent, an LLM-driven agent that learns from self-experience to\ncomplete tasks on real websites following natural language instructions.\nWebAgent plans ahead by decomposing instructions into canonical\nsub-instructions, summarizes long HTML documents into task-relevant snippets,\nand acts on websites via Python programs generated from those. We design\nWebAgent with Flan-U-PaLM, for grounded code generation, and HTML-T5, new\npre-trained LLMs for long HTML documents using local and global attention\nmechanisms and a mixture of long-span denoising objectives, for planning and\nsummarization. We empirically demonstrate that our modular recipe improves the\nsuccess on real websites by over 50%, and that HTML-T5 is the best model to\nsolve various HTML understanding tasks; achieving 18.7% higher success rate\nthan the prior method on MiniWoB web automation benchmark, and SoTA performance\non Mind2Web, an offline task planning evaluation.\n","authors":["Izzeddin Gur","Hiroki Furuta","Austin Huang","Mustafa Safdari","Yutaka Matsuo","Douglas Eck","Aleksandra Faust"],"pdf_url":"https://arxiv.org/pdf/2307.12856v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.12091v2","updated":"2023-10-03T03:33:20Z","published":"2023-05-20T04:43:26Z","title":"\"What do others think?\": Task-Oriented Conversational Modeling with\n Subjective Knowledge","summary":" Task-oriented Dialogue (TOD) Systems aim to build dialogue systems that\nassist users in accomplishing specific goals, such as booking a hotel or a\nrestaurant. Traditional TODs rely on domain-specific APIs/DBs or external\nfactual knowledge to generate responses, which cannot accommodate subjective\nuser requests (e.g., \"Is the WIFI reliable?\" or \"Does the restaurant have a\ngood atmosphere?\"). To address this issue, we propose a novel task of\nsubjective-knowledge-based TOD (SK-TOD). We also propose the first\ncorresponding dataset, which contains subjective knowledge-seeking dialogue\ncontexts and manually annotated responses grounded in subjective knowledge\nsources. When evaluated with existing TOD approaches, we find that this task\nposes new challenges such as aggregating diverse opinions from multiple\nknowledge snippets. 
We hope this task and dataset can promote further research\non TOD and subjective content understanding. The code and the dataset are\navailable at https://github.com/alexa/dstc11-track5.\n","authors":["Chao Zhao","Spandana Gella","Seokhwan Kim","Di Jin","Devamanyu Hazarika","Alexandros Papangelis","Behnam Hedayatnia","Mahdi Namazifar","Yang Liu","Dilek Hakkani-Tur"],"pdf_url":"https://arxiv.org/pdf/2305.12091v2.pdf","comment":"SIGDIAL 2023"},{"id":"http://arxiv.org/abs/2309.05653v3","updated":"2023-10-03T02:48:42Z","published":"2023-09-11T17:47:22Z","title":"MAmmoTH: Building Math Generalist Models through Hybrid Instruction\n Tuning","summary":" We introduce MAmmoTH, a series of open-source large language models (LLMs)\nspecifically tailored for general math problem-solving. The MAmmoTH models are\ntrained on MathInstruct, our meticulously curated instruction tuning dataset.\nMathInstruct is compiled from 13 math datasets with intermediate rationales,\nsix of which have rationales newly curated by us. It presents a unique hybrid\nof chain-of-thought (CoT) and program-of-thought (PoT) rationales, and also\nensures extensive coverage of diverse fields in math. The hybrid of CoT and PoT\nnot only unleashes the potential of tool use but also allows different thought\nprocesses for different math problems. As a result, the MAmmoTH series\nsubstantially outperform existing open-source models on nine mathematical\nreasoning datasets across all scales with an average accuracy gain between 16%\nand 32%. Remarkably, our MAmmoTH-7B model reaches 33% on MATH (a\ncompetition-level dataset), which exceeds the best open-source 7B model\n(WizardMath) by 23%, and the MAmmoTH-34B model achieves 44% accuracy on MATH,\neven surpassing GPT-4's CoT result. Our work underscores the importance of\ndiverse problem coverage and the use of hybrid rationales in developing\nsuperior math generalist models.\n","authors":["Xiang Yue","Xingwei Qu","Ge Zhang","Yao Fu","Wenhao Huang","Huan Sun","Yu Su","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2309.05653v3.pdf","comment":"Work in progress; Xiang Yue and Wenhu Chen contributed equally to\n this paper"},{"id":"http://arxiv.org/abs/2305.13300v3","updated":"2023-10-03T02:47:41Z","published":"2023-05-22T17:57:41Z","title":"Adaptive Chameleon or Stubborn Sloth: Revealing the Behavior of Large\n Language Models in Knowledge Conflicts","summary":" By providing external information to large language models (LLMs), tool\naugmentation (including retrieval augmentation) has emerged as a promising\nsolution for addressing the limitations of LLMs' static parametric memory.\nHowever, how receptive are LLMs to such external evidence, especially when the\nevidence conflicts with their parametric memory? We present the first\ncomprehensive and controlled investigation into the behavior of LLMs when\nencountering knowledge conflicts. We propose a systematic framework to elicit\nhigh-quality parametric memory from LLMs and construct the corresponding\ncounter-memory, which enables us to conduct a series of controlled experiments.\nOur investigation reveals seemingly contradicting behaviors of LLMs. On the one\nhand, different from prior wisdom, we find that LLMs can be highly receptive to\nexternal evidence even when that conflicts with their parametric memory, given\nthat the external evidence is coherent and convincing. 
On the other hand, LLMs\nalso demonstrate a strong confirmation bias when the external evidence contains\nsome information that is consistent with their parametric memory, despite being\npresented with conflicting evidence at the same time. These results pose\nimportant implications that are worth careful consideration for the further\ndevelopment and deployment of tool- and retrieval-augmented LLMs.\n","authors":["Jian Xie","Kai Zhang","Jiangjie Chen","Renze Lou","Yu Su"],"pdf_url":"https://arxiv.org/pdf/2305.13300v3.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2308.16137v4","updated":"2023-10-03T02:46:50Z","published":"2023-08-30T16:47:51Z","title":"LM-Infinite: Simple On-the-Fly Length Generalization for Large Language\n Models","summary":" In recent years, there have been remarkable advancements in the performance\nof Transformer-based Large Language Models (LLMs) across various domains. As\nthese LLMs are deployed for increasingly complex domains, they often face the\nneed to follow longer user prompts or generate longer texts. In these\nsituations, the $\\textit{length generalization failure}$ of LLMs on long\nsequences becomes more prominent. Most pre-training schemes truncate training\nsequences to a fixed length. LLMs often struggle to generate fluent and\ncoherent texts after longer contexts, even with relative positional encoding\nspecifically designed to cope with this problem. Common solutions such as\nfinetuning on longer corpora often involve daunting hardware and time costs and\nrequire careful training process design. To more efficiently extrapolate\nexisting LLMs' generation quality to longer texts, we theoretically and\nempirically investigate the main out-of-distribution (OOD) factors contributing\nto this problem. Inspired by this diagnosis, we propose a simple yet effective\nsolution for on-the-fly length generalization, LM-Infinite. It involves only a\n$\\mathbf{\\Lambda}$-shaped attention mask (to avoid excessive attended tokens)\nand a distance limit (to avoid unseen distances) while requiring no parameter\nupdates or learning. We find it applicable to a variety of LLMs using\nrelative-position encoding methods. LM-Infinite is computationally efficient\nwith $O(n)$ time and space, and demonstrates consistent text generation fluency\nand quality to as long as 128k tokens on ArXiv and OpenWebText2 datasets, with\n2.72x decoding speedup. We will make the codes publicly available following\npublication.\n","authors":["Chi Han","Qifan Wang","Wenhan Xiong","Yu Chen","Heng Ji","Sinong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.16137v4.pdf","comment":"9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2310.01749v1","updated":"2023-10-03T02:18:06Z","published":"2023-10-03T02:18:06Z","title":"Stack Attention: Improving the Ability of Transformers to Model\n Hierarchical Patterns","summary":" Attention, specifically scaled dot-product attention, has proven effective\nfor natural language, but it does not have a mechanism for handling\nhierarchical patterns of arbitrary nesting depth, which limits its ability to\nrecognize certain syntactic structures. To address this shortcoming, we propose\nstack attention: an attention operator that incorporates stacks, inspired by\ntheir theoretical connections to context-free languages (CFLs). We show that\nstack attention is analogous to standard attention, but with a latent model of\nsyntax that requires no syntactic supervision. 
We propose two variants: one\nrelated to deterministic pushdown automata (PDAs) and one based on\nnondeterministic PDAs, which allows transformers to recognize arbitrary CFLs.\nWe show that transformers with stack attention are very effective at learning\nCFLs that standard transformers struggle on, achieving strong results on a CFL\nwith theoretically maximal parsing difficulty. We also show that stack\nattention is more effective at natural language modeling under a constrained\nparameter budget, and we include results on machine translation.\n","authors":["Brian DuSell","David Chiang"],"pdf_url":"https://arxiv.org/pdf/2310.01749v1.pdf","comment":"17 pages, 2 figures"},{"id":"http://arxiv.org/abs/2304.08612v2","updated":"2023-10-03T02:07:30Z","published":"2023-04-17T20:59:49Z","title":"Bridging Discrete and Backpropagation: Straight-Through and Beyond","summary":" Backpropagation, the cornerstone of deep learning, is limited to computing\ngradients for continuous variables. This limitation poses challenges for\nproblems involving discrete latent variables. To address this issue, we propose\na novel approach to approximate the gradient of parameters involved in\ngenerating discrete latent variables. First, we examine the widely used\nStraight-Through (ST) heuristic and demonstrate that it works as a first-order\napproximation of the gradient. Guided by our findings, we propose ReinMax,\nwhich achieves second-order accuracy by integrating Heun's method, a\nsecond-order numerical method for solving ODEs. ReinMax does not require\nHessian or other second-order derivatives, thus having negligible computation\noverheads. Extensive experimental results on various tasks demonstrate the\nsuperiority of ReinMax over the state of the art. Implementations are released\nat https://github.com/microsoft/ReinMax.\n","authors":["Liyuan Liu","Chengyu Dong","Xiaodong Liu","Bin Yu","Jianfeng Gao"],"pdf_url":"https://arxiv.org/pdf/2304.08612v2.pdf","comment":"NeurIPS 2023 (Oral)"},{"id":"http://arxiv.org/abs/2310.01732v1","updated":"2023-10-03T01:47:49Z","published":"2023-10-03T01:47:49Z","title":"Nugget: Neural Agglomerative Embeddings of Text","summary":" Embedding text sequences is a widespread requirement in modern language\nunderstanding. Existing approaches focus largely on constant-size\nrepresentations. This is problematic, as the amount of information contained in\ntext often varies with the length of the input. We propose a solution called\nNugget, which encodes language into a representation based on a dynamically\nselected subset of input tokens. These nuggets are learned through tasks like\nautoencoding and machine translation, and intuitively segment language into\nmeaningful units. We demonstrate Nugget outperforms related approaches in tasks\ninvolving semantic comparison. 
Finally, we illustrate these compact units allow\nfor expanding the contextual window of a language model (LM), suggesting new\nfuture LMs that can condition on significantly larger amounts of content.\n","authors":["Guanghui Qin","Benjamin Van Durme"],"pdf_url":"https://arxiv.org/pdf/2310.01732v1.pdf","comment":"Appeared at ICML 2023"},{"id":"http://arxiv.org/abs/2307.02738v3","updated":"2023-10-03T01:16:33Z","published":"2023-07-06T02:51:54Z","title":"RecallM: An Adaptable Memory Mechanism with Temporal Understanding for\n Large Language Models","summary":" Large Language Models (LLMs) have made extraordinary progress in the field of\nArtificial Intelligence and have demonstrated remarkable capabilities across a\nlarge variety of tasks and domains. However, as we venture closer to creating\nArtificial General Intelligence (AGI) systems, we recognize the need to\nsupplement LLMs with long-term memory to overcome the context window limitation\nand more importantly, to create a foundation for sustained reasoning,\ncumulative learning and long-term user interaction. In this paper we propose\nRecallM, a novel architecture for providing LLMs with an adaptable and\nupdatable long-term memory mechanism. Unlike previous methods, the RecallM\narchitecture is particularly effective at belief updating and maintaining a\ntemporal understanding of the knowledge provided to it. We demonstrate through\nvarious experiments the effectiveness of this architecture. Furthermore,\nthrough our own temporal understanding and belief updating experiments, we show\nthat RecallM is four times more effective than using a vector database for\nupdating knowledge previously stored in long-term memory. We also demonstrate\nthat RecallM shows competitive performance on general question-answering and\nin-context learning tasks.\n","authors":["Brandon Kynoch","Hugo Latapie","Dwane van der Sluis"],"pdf_url":"https://arxiv.org/pdf/2307.02738v3.pdf","comment":"8 pages, 7 figures, 1 table, Our code is publicly available online\n at: https://github.com/cisco-open/DeepVision/tree/main/recallm"},{"id":"http://arxiv.org/abs/2310.01717v1","updated":"2023-10-03T01:02:44Z","published":"2023-10-03T01:02:44Z","title":"Ensemble Distillation for Unsupervised Constituency Parsing","summary":" We investigate the unsupervised constituency parsing task, which organizes\nwords and phrases of a sentence into a hierarchical structure without using\nlinguistically annotated data. We observe that existing unsupervised parsers\ncapture differing aspects of parsing structures, which can be leveraged to\nenhance unsupervised parsing performance. To this end, we propose a notion of\n\"tree averaging,\" based on which we further propose a novel ensemble method for\nunsupervised parsing. To improve inference efficiency, we further distill the\nensemble knowledge into a student model; such an ensemble-then-distill process\nis an effective approach to mitigate the over-smoothing problem existing in\ncommon multi-teacher distilling methods. Experiments show that our method\nsurpasses all previous approaches, consistently demonstrating its effectiveness\nand robustness across various runs, with different ensemble components, and\nunder domain-shift conditions.\n","authors":["Behzad Shayegh","Yanshuai Cao","Xiaodan Zhu","Jackie C. K. 
Cheung","Lili Mou"],"pdf_url":"https://arxiv.org/pdf/2310.01717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01708v1","updated":"2023-10-03T00:08:23Z","published":"2023-10-03T00:08:23Z","title":"Deciphering Diagnoses: How Large Language Models Explanations Influence\n Clinical Decision Making","summary":" Clinical Decision Support Systems (CDSS) utilize evidence-based knowledge and\npatient data to offer real-time recommendations, with Large Language Models\n(LLMs) emerging as a promising tool to generate plain-text explanations for\nmedical decisions. This study explores the effectiveness and reliability of\nLLMs in generating explanations for diagnoses based on patient complaints.\nThree experienced doctors evaluated LLM-generated explanations of the\nconnection between patient complaints and doctor and model-assigned diagnoses\nacross several stages. Experimental results demonstrated that LLM explanations\nsignificantly increased doctors' agreement rates with given diagnoses and\nhighlighted potential errors in LLM outputs, ranging from 5% to 30%. The study\nunderscores the potential and challenges of LLMs in healthcare and emphasizes\nthe need for careful integration and evaluation to ensure patient safety and\noptimal clinical utility.\n","authors":["D. Umerenkov","G. Zubkova","A. Nesterov"],"pdf_url":"https://arxiv.org/pdf/2310.01708v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02489v1","updated":"2023-10-03T23:31:48Z","published":"2023-10-03T23:31:48Z","title":"ResidualTransformer: Residual Low-rank Learning with Weight-sharing for\n Transformer Layers","summary":" Memory constraint of always-on devices is one of the major concerns when\ndeploying speech processing models on these devices. While larger models\ntrained with sufficiently large amount of data generally perform better, making\nthem fit in the device memory is a demanding challenge. In this paper, we aim\nto reduce model size by reparameterizing model weights across Transformer\nencoder layers and assuming a special weight composition and structure. More\nspecifically, inspired by ResNet and the more recent LoRA work, we propose an\napproach named ResidualTransformer, where each weight matrix in a Transformer\nlayer comprises 1) a shared full-rank component with its adjacent layers, and\n2) a unique low-rank component to itself. The low-rank matrices only account\nfor a small amount of model size increase. In addition, we add diagonal weight\nmatrices to improve modeling capacity of the low-rank matrices. Experiments of\nour 10k-hour speech recognition and speech translation tasks show that the\nTransformer encoder size can be reduced by ~3X with very slight performance\ndegradation.\n","authors":["Yiming Wang","Jinyu Li"],"pdf_url":"https://arxiv.org/pdf/2310.02489v1.pdf","comment":"Submitted to IEEE ICASSP 2024. 5 pages, 1 figure"},{"id":"http://arxiv.org/abs/2305.14779v2","updated":"2023-10-03T23:01:05Z","published":"2023-05-24T06:35:26Z","title":"Alt-Text with Context: Improving Accessibility for Images on Twitter","summary":" In this work we present an approach for generating alternative text (or\nalt-text) descriptions for images shared on social media, specifically Twitter.\nMore than just a special case of image captioning, alt-text is both more\nliterally descriptive and context-specific. 
Also critically, images posted to\nTwitter are often accompanied by user-written text that despite not necessarily\ndescribing the image may provide useful context that if properly leveraged can\nbe informative. We address this task with a multimodal model that conditions on\nboth textual information from the associated social media post as well as\nvisual signal from the image, and demonstrate that the utility of these two\ninformation sources stacks. We put forward a new dataset of 371k images paired\nwith alt-text and tweets scraped from Twitter and evaluate on it across a\nvariety of automated metrics as well as human evaluation. We show that our\napproach of conditioning on both tweet text and visual information\nsignificantly outperforms prior work, by more than 2x on BLEU@4.\n","authors":["Nikita Srivatsan","Sofia Samaniego","Omar Florez","Taylor Berg-Kirkpatrick"],"pdf_url":"https://arxiv.org/pdf/2305.14779v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02469v1","updated":"2023-10-03T22:37:01Z","published":"2023-10-03T22:37:01Z","title":"Large Language Models Can Be Good Privacy Protection Learners","summary":" The proliferation of Large Language Models (LLMs) has driven considerable\ninterest in fine-tuning them with domain-specific data to create specialized\nlanguage models. Nevertheless, such domain-specific fine-tuning data often\ncontains sensitive personally identifiable information (PII). Direct\nfine-tuning LLMs on this data without privacy protection poses a risk of\nleakage. To address this challenge, we introduce Privacy Protection Language\nModels (PPLM), a novel paradigm for fine-tuning LLMs that effectively injects\ndomain-specific knowledge while safeguarding data privacy. Our work offers a\ntheoretical analysis for model design and delves into various techniques such\nas corpus curation, penalty-based unlikelihood in training loss, and\ninstruction-based tuning, etc. Extensive experiments across diverse datasets\nand scenarios demonstrate the effectiveness of our approaches. In particular,\ninstruction tuning with both positive and negative examples, stands out as a\npromising method, effectively protecting private data while enhancing the\nmodel's knowledge. Our work underscores the potential for Large Language Models\nas robust privacy protection learners.\n","authors":["Yijia Xiao","Yiqiao Jin","Yushi Bai","Yue Wu","Xianjun Yang","Xiao Luo","Wenchao Yu","Xujiang Zhao","Yanchi Liu","Haifeng Chen","Wei Wang","Wei Cheng"],"pdf_url":"https://arxiv.org/pdf/2310.02469v1.pdf","comment":"20 pages, 4 figures, 8 tables"},{"id":"http://arxiv.org/abs/2302.04511v2","updated":"2023-10-03T22:27:03Z","published":"2023-02-09T09:08:19Z","title":"A Large-Scale Analysis of Persian Tweets Regarding Covid-19 Vaccination","summary":" The Covid-19 pandemic had an enormous effect on our lives, especially on\npeople's interactions. By introducing Covid-19 vaccines, both positive and\nnegative opinions were raised over the subject of taking vaccines or not. In\nthis paper, using data gathered from Twitter, including tweets and user\nprofiles, we offer a comprehensive analysis of public opinion in Iran about the\nCoronavirus vaccines. For this purpose, we applied a search query technique\ncombined with a topic modeling approach to extract vaccine-related tweets. We\nutilized transformer-based models to classify the content of the tweets and\nextract themes revolving around vaccination. 
We also conducted an emotion\nanalysis to evaluate the public happiness and anger around this topic. Our\nresults demonstrate that Covid-19 vaccination has attracted considerable\nattention from different angles, such as governmental issues, safety or\nhesitancy, and side effects. Moreover, Coronavirus-relevant phenomena like\npublic vaccination and the rate of infection deeply impacted public emotional\nstatus and users' interactions.\n","authors":["Taha ShabaniMirzaei","Houmaan Chamani","Amirhossein Abaskohi","Zhivar Sourati Hassan Zadeh","Behnam Bahrak"],"pdf_url":"https://arxiv.org/pdf/2302.04511v2.pdf","comment":"10 figures"},{"id":"http://arxiv.org/abs/2310.02457v1","updated":"2023-10-03T22:02:17Z","published":"2023-10-03T22:02:17Z","title":"The Empty Signifier Problem: Towards Clearer Paradigms for\n Operationalising \"Alignment\" in Large Language Models","summary":" In this paper, we address the concept of \"alignment\" in large language models\n(LLMs) through the lens of post-structuralist socio-political theory,\nspecifically examining its parallels to empty signifiers. To establish a shared\nvocabulary around how abstract concepts of alignment are operationalised in\nempirical datasets, we propose a framework that demarcates: 1) which dimensions\nof model behaviour are considered important, then 2) how meanings and\ndefinitions are ascribed to these dimensions, and by whom. We situate existing\nempirical literature and provide guidance on deciding which paradigm to follow.\nThrough this framework, we aim to foster a culture of transparency and critical\nevaluation, aiding the community in navigating the complexities of aligning\nLLMs with human populations.\n","authors":["Hannah Rose Kirk","Bertie Vidgen","Paul Röttger","Scott A. Hale"],"pdf_url":"https://arxiv.org/pdf/2310.02457v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02451v1","updated":"2023-10-03T21:40:44Z","published":"2023-10-03T21:40:44Z","title":"Backdoor Adjustment of Confounding by Provenance for Robust Text\n Classification of Multi-institutional Clinical Notes","summary":" Natural Language Processing (NLP) methods have been broadly applied to\nclinical tasks. Machine learning and deep learning approaches have been used to\nimprove the performance of clinical NLP. However, these approaches require\nsufficiently large datasets for training, and trained models have been shown to\ntransfer poorly across sites. These issues have led to the promotion of data\ncollection and integration across different institutions for accurate and\nportable models. However, this can introduce a form of bias called confounding\nby provenance. When source-specific data distributions differ at deployment,\nthis may harm model performance. To address this issue, we evaluate the utility\nof backdoor adjustment for text classification in a multi-site dataset of\nclinical notes annotated for mentions of substance abuse. Using an evaluation\nframework devised to measure robustness to distributional shifts, we assess the\nutility of backdoor adjustment. 
Our results indicate that backdoor adjustment\ncan effectively mitigate for confounding shift.\n","authors":["Xiruo Ding","Zhecheng Sheng","Meliha Yetişgen","Serguei Pakhomov","Trevor Cohen"],"pdf_url":"https://arxiv.org/pdf/2310.02451v1.pdf","comment":"Accepted in AMIA 2023 Annual Symposium"},{"id":"http://arxiv.org/abs/2301.13310v2","updated":"2023-10-03T21:40:41Z","published":"2023-01-30T22:06:05Z","title":"Alternating Updates for Efficient Transformers","summary":" It has been well established that increasing scale in deep transformer\nnetworks leads to improved quality and performance. However, this increase in\nscale often comes with prohibitive increases in compute cost and inference\nlatency. We introduce Alternating Updates (AltUp), a simple-to-implement method\nto increase a model's capacity without the computational burden. AltUp enables\nthe widening of the learned representation, i.e., the token embedding, while\nonly incurring a negligible increase in latency. AltUp achieves this by working\non a subblock of the widened representation at each layer and using a\npredict-and-correct mechanism to update the inactivated blocks. We present\nextensions of AltUp, such as its applicability to the sequence dimension, and\ndemonstrate how AltUp can be synergistically combined with existing approaches,\nsuch as Sparse Mixture-of-Experts models, to obtain efficient models with even\nhigher capacity. Our experiments on benchmark transformer models and language\ntasks demonstrate the consistent effectiveness of AltUp on a diverse set of\nscenarios. Notably, on SuperGLUE and SQuAD benchmarks, AltUp enables up to\n$87\\%$ speedup relative to the dense baselines at the same accuracy.\n","authors":["Cenk Baykal","Dylan Cutler","Nishanth Dikkala","Nikhil Ghosh","Rina Panigrahy","Xin Wang"],"pdf_url":"https://arxiv.org/pdf/2301.13310v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02446v1","updated":"2023-10-03T21:30:56Z","published":"2023-10-03T21:30:56Z","title":"Low-Resource Languages Jailbreak GPT-4","summary":" AI safety training and red-teaming of large language models (LLMs) are\nmeasures to mitigate the generation of unsafe content. Our work exposes the\ninherent cross-lingual vulnerability of these safety mechanisms, resulting from\nthe linguistic inequality of safety training data, by successfully\ncircumventing GPT-4's safeguard through translating unsafe English inputs into\nlow-resource languages. On the AdvBenchmark, GPT-4 engages with the unsafe\ntranslated inputs and provides actionable items that can get the users towards\ntheir harmful goals 79% of the time, which is on par with or even surpassing\nstate-of-the-art jailbreaking attacks. Other high-/mid-resource languages have\nsignificantly lower attack success rate, which suggests that the cross-lingual\nvulnerability mainly applies to low-resource languages. Previously, limited\ntraining on low-resource languages primarily affects speakers of those\nlanguages, causing technological disparities. However, our work highlights a\ncrucial shift: this deficiency now poses a risk to all LLMs users. Publicly\navailable translation APIs enable anyone to exploit LLMs' safety\nvulnerabilities. Therefore, our work calls for a more holistic red-teaming\nefforts to develop robust multilingual safeguards with wide language coverage.\n","authors":["Zheng-Xin Yong","Cristina Menghini","Stephen H. 
Bach"],"pdf_url":"https://arxiv.org/pdf/2310.02446v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02439v1","updated":"2023-10-03T21:19:50Z","published":"2023-10-03T21:19:50Z","title":"Novice Learner and Expert Tutor: Evaluating Math Reasoning Abilities of\n Large Language Models with Misconceptions","summary":" We propose novel evaluations for mathematical reasoning capabilities of Large\nLanguage Models (LLMs) based on mathematical misconceptions. Our primary\napproach is to simulate LLMs as a novice learner and an expert tutor, aiming to\nidentify the incorrect answer to math question resulted from a specific\nmisconception and to recognize the misconception(s) behind an incorrect answer,\nrespectively. Contrary to traditional LLMs-based mathematical evaluations that\nfocus on answering math questions correctly, our approach takes inspirations\nfrom principles in educational learning sciences. We explicitly ask LLMs to\nmimic a novice learner by answering questions in a specific incorrect manner\nbased on incomplete knowledge; and to mimic an expert tutor by identifying\nmisconception(s) corresponding to an incorrect answer to a question. Using\nsimple grade-school math problems, our experiments reveal that, while LLMs can\neasily answer these questions correctly, they struggle to identify 1) the\nincorrect answer corresponding to specific incomplete knowledge\n(misconceptions); 2) the misconceptions that explain particular incorrect\nanswers. Our study indicates new opportunities for enhancing LLMs' math\nreasoning capabilities, especially on developing robust student simulation and\nexpert tutoring models in the educational applications such as intelligent\ntutoring systems.\n","authors":["Naiming Liu","Shashank Sonkar","Zichao Wang","Simon Woodhead","Richard G. Baraniuk"],"pdf_url":"https://arxiv.org/pdf/2310.02439v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02431v1","updated":"2023-10-03T20:54:29Z","published":"2023-10-03T20:54:29Z","title":"Can Large Language Models Provide Security & Privacy Advice? Measuring\n the Ability of LLMs to Refute Misconceptions","summary":" Users seek security & privacy (S&P) advice from online resources, including\ntrusted websites and content-sharing platforms. These resources help users\nunderstand S&P technologies and tools and suggest actionable strategies. Large\nLanguage Models (LLMs) have recently emerged as trusted information sources.\nHowever, their accuracy and correctness have been called into question. Prior\nresearch has outlined the shortcomings of LLMs in answering multiple-choice\nquestions and user ability to inadvertently circumvent model restrictions\n(e.g., to produce toxic content). Yet, the ability of LLMs to provide reliable\nS&P advice is not well-explored. In this paper, we measure their ability to\nrefute popular S&P misconceptions that the general public holds. We first study\nrecent academic literature to curate a dataset of over a hundred S&P-related\nmisconceptions across six different topics. We then query two popular LLMs\n(Bard and ChatGPT) and develop a labeling guide to evaluate their responses to\nthese misconceptions. To comprehensively evaluate their responses, we further\napply three strategies: query each misconception multiple times, generate and\nquery their paraphrases, and solicit source URLs of the responses. Both models\ndemonstrate, on average, a 21.3% non-negligible error rate, incorrectly\nsupporting popular S&P misconceptions. 
The error rate increases to 32.6% when\nwe repeatedly query LLMs with the same or paraphrased misconceptions. We also\nexpose that models may partially support a misconception or remain\nnoncommittal, refusing a firm stance on misconceptions. Our exploration of\ninformation sources for responses revealed that LLMs are susceptible to\nproviding invalid URLs (21.2% for Bard and 67.7% for ChatGPT) or point to\nunrelated sources (44.2% returned by Bard and 18.3% by ChatGPT).\n","authors":["Yufan Chen","Arjun Arunasalam","Z. Berkay Celik"],"pdf_url":"https://arxiv.org/pdf/2310.02431v1.pdf","comment":"Accepted to the Annual Computer Security Applications Conference\n (ACSAC), 2023"},{"id":"http://arxiv.org/abs/2308.08155v2","updated":"2023-10-03T20:47:10Z","published":"2023-08-16T05:57:52Z","title":"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation","summary":" AutoGen is an open-source framework that allows developers to build LLM\napplications via multiple agents that can converse with each other to\naccomplish tasks. AutoGen agents are customizable, conversable, and can operate\nin various modes that employ combinations of LLMs, human inputs, and tools.\nUsing AutoGen, developers can also flexibly define agent interaction behaviors.\nBoth natural language and computer code can be used to program flexible\nconversation patterns for different applications. AutoGen serves as a generic\ninfrastructure to build diverse applications of various complexities and LLM\ncapacities. Empirical studies demonstrate the effectiveness of the framework in\nmany example applications, with domains ranging from mathematics, coding,\nquestion answering, operations research, online decision-making, entertainment,\netc.\n","authors":["Qingyun Wu","Gagan Bansal","Jieyu Zhang","Yiran Wu","Beibin Li","Erkang Zhu","Li Jiang","Xiaoyun Zhang","Shaokun Zhang","Jiale Liu","Ahmed Hassan Awadallah","Ryen W White","Doug Burger","Chi Wang"],"pdf_url":"https://arxiv.org/pdf/2308.08155v2.pdf","comment":"43 pages (10 pages for the main text, 3 pages for references, and 30\n pages for appendices)"},{"id":"http://arxiv.org/abs/2310.02421v1","updated":"2023-10-03T20:34:59Z","published":"2023-10-03T20:34:59Z","title":"Can a student Large Language Model perform as well as it's teacher?","summary":" The burgeoning complexity of contemporary deep learning models, while\nachieving unparalleled accuracy, has inadvertently introduced deployment\nchallenges in resource-constrained environments. Knowledge distillation, a\ntechnique aiming to transfer knowledge from a high-capacity \"teacher\" model to\na streamlined \"student\" model, emerges as a promising solution to this dilemma.\nThis paper provides a comprehensive overview of the knowledge distillation\nparadigm, emphasizing its foundational principles such as the utility of soft\nlabels and the significance of temperature scaling. Through meticulous\nexamination, we elucidate the critical determinants of successful distillation,\nincluding the architecture of the student model, the caliber of the teacher,\nand the delicate balance of hyperparameters. While acknowledging its profound\nadvantages, we also delve into the complexities and challenges inherent in the\nprocess. 
Our exploration underscores knowledge distillation's potential as a\npivotal technique in optimizing the trade-off between model performance and\ndeployment efficiency.\n","authors":["Sia Gholami","Marwan Omar"],"pdf_url":"https://arxiv.org/pdf/2310.02421v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.10313v3","updated":"2023-10-03T20:18:39Z","published":"2023-09-19T04:51:13Z","title":"Investigating the Catastrophic Forgetting in Multimodal Large Language\n Models","summary":" Following the success of GPT4, there has been a surge in interest in\nmultimodal large language model (MLLM) research. This line of research focuses\non developing general-purpose LLMs through fine-tuning pre-trained LLMs and\nvision models. However, catastrophic forgetting, a notorious phenomenon where\nthe fine-tuned model fails to retain similar performance compared to the\npre-trained model, still remains an inherent problem in multimodal LLMs (MLLM).\nIn this paper, we introduce EMT: Evaluating MulTimodality for evaluating the\ncatastrophic forgetting in MLLMs, by treating each MLLM as an image classifier.\nWe first apply EMT to evaluate several open-source fine-tuned MLLMs and we\ndiscover that almost all evaluated MLLMs fail to retain the same performance\nlevels as their vision encoders on standard image classification tasks.\nMoreover, we continue fine-tuning LLaVA, an MLLM and utilize EMT to assess\nperformance throughout the fine-tuning. Interestingly, our results suggest that\nearly-stage fine-tuning on an image dataset improves performance across other\nimage datasets, by enhancing the alignment of text and visual features.\nHowever, as fine-tuning proceeds, the MLLMs begin to hallucinate, resulting in\na significant loss of generalizability, even when the image encoder remains\nfrozen. Our results suggest that MLLMs have yet to demonstrate performance on\npar with their vision models on standard image classification tasks and the\ncurrent MLLM fine-tuning procedure still has room for improvement.\n","authors":["Yuexiang Zhai","Shengbang Tong","Xiao Li","Mu Cai","Qing Qu","Yong Jae Lee","Yi Ma"],"pdf_url":"https://arxiv.org/pdf/2309.10313v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02410v1","updated":"2023-10-03T20:11:23Z","published":"2023-10-03T20:11:23Z","title":"Mixture of Quantized Experts (MoQE): Complementary Effect of Low-bit\n Quantization and Robustness","summary":" Large Mixture of Experts (MoE) models could achieve state-of-the-art quality\non various language tasks, including machine translation task, thanks to the\nefficient model scaling capability with expert parallelism. However, it has\nbrought a fundamental issue of larger memory consumption and increased memory\nbandwidth bottleneck at deployment time. In this paper, we propose Mixture of\nQuantized Experts (MoQE) which is a simple weight-only quantization method\napplying ultra low-bit down to 2-bit quantizations only to expert weights for\nmitigating the increased memory and latency issues of MoE models. We show that\nlow-bit quantization together with the MoE architecture delivers a reliable\nmodel performance while reducing the memory size significantly even without any\nadditional training in most cases. In particular, expert layers in MoE models\nare much more robust to the quantization than conventional feedforward networks\n(FFN) layers. In our comprehensive analysis, we show that MoE models with 2-bit\nexpert weights can deliver better model performance than the dense model\ntrained on the same dataset. 
As a result of low-bit quantization, we show the\nmodel size can be reduced by 79.6% of the original half precision floating\npoint (fp16) MoE model. Combined with an optimized GPU runtime implementation,\nit also achieves 1.24X speed-up on A100 GPUs.\n","authors":["Young Jin Kim","Raffy Fahim","Hany Hassan Awadalla"],"pdf_url":"https://arxiv.org/pdf/2310.02410v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02409v1","updated":"2023-10-03T20:07:06Z","published":"2023-10-03T20:07:06Z","title":"Nugget 2D: Dynamic Contextual Compression for Scaling Decoder-only\n Language Models","summary":" Standard Transformer-based language models (LMs) scale poorly to long\ncontexts. We propose a solution based on dynamic contextual compression, which\nextends the Nugget approach of Qin & Van Durme (2023) from BERT-like frameworks\nto decoder-only LMs. Our method models history as compressed \"nuggets\" which\nare trained to allow for reconstruction, and it can be initialized with\noff-the-shelf models such as LLaMA. We demonstrate through experiments in\nlanguage modeling, question answering, and summarization that Nugget2D retains\ncapabilities in these tasks, while drastically reducing the overhead during\ndecoding in terms of time and space. For example, in the experiments of\nautoencoding, Nugget2D can shrink context at a 20x compression ratio with a\nBLEU score of 98% for reconstruction, achieving nearly lossless encoding.\n","authors":["Guanghui Qin","Corby Rosset","Ethan C. Chau","Nikhil Rao","Benjamin Van Durme"],"pdf_url":"https://arxiv.org/pdf/2310.02409v1.pdf","comment":"Preprint. 15 pages and 7 figures"},{"id":"http://arxiv.org/abs/2310.02408v1","updated":"2023-10-03T20:03:08Z","published":"2023-10-03T20:03:08Z","title":"MindTheDApp: A Toolchain for Complex Network-Driven Structural Analysis\n of Ethereum-based Decentralised Applications","summary":" This paper presents MindTheDApp, a toolchain designed specifically for the\nstructural analysis of Ethereum-based Decentralized Applications (DApps), with\na distinct focus on a complex network-driven approach. Unlike existing tools,\nour toolchain combines the power of ANTLR4 and Abstract Syntax Tree (AST)\ntraversal techniques to transform the architecture and interactions within\nsmart contracts into a specialized bipartite graph. This enables advanced\nnetwork analytics to highlight operational efficiencies within the DApp's\narchitecture.\n The bipartite graph generated by the proposed tool comprises two sets of\nnodes: one representing smart contracts, interfaces, and libraries, and the\nother including functions, events, and modifiers. Edges in the graph connect\nfunctions to smart contracts they interact with, offering a granular view of\ninterdependencies and execution flow within the DApp. This network-centric\napproach allows researchers and practitioners to apply complex network theory\nin understanding the robustness, adaptability, and intricacies of decentralized\nsystems.\n Our work contributes to the enhancement of security in smart contracts by\nallowing the visualisation of the network, and it provides a deep understanding\nof the architecture and operational logic within DApps. 
Given the growing\nimportance of smart contracts in the blockchain ecosystem and the emerging\napplication of complex network theory in technology, our toolchain offers a\ntimely contribution to both academic research and practical applications in the\nfield of blockchain technology.\n","authors":["Giacomo Ibba","Sabrina Aufiero","Silvia Bartolucci","Rumyana Neykova","Marco Ortu","Roberto Tonelli","Giuseppe Destefanis"],"pdf_url":"https://arxiv.org/pdf/2310.02408v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.03872v3","updated":"2023-10-03T19:48:22Z","published":"2023-06-06T17:18:56Z","title":"Deductive Verification of Chain-of-Thought Reasoning","summary":" Large Language Models (LLMs) significantly benefit from Chain-of-Thought\n(CoT) prompting in performing various reasoning tasks. While CoT allows models\nto produce more comprehensive reasoning processes, its emphasis on intermediate\nreasoning steps can inadvertently introduce hallucinations and accumulated\nerrors, thereby limiting models' ability to solve complex reasoning tasks.\nInspired by how humans engage in careful and meticulous deductive logical\nreasoning processes to solve tasks, we seek to enable language models to\nperform explicit and rigorous deductive reasoning, and also ensure the\ntrustworthiness of their reasoning process through self-verification. However,\ndirectly verifying the validity of an entire deductive reasoning process is\nchallenging, even with advanced models like ChatGPT. In light of this, we\npropose to decompose a reasoning verification process into a series of\nstep-by-step subprocesses, each only receiving their necessary context and\npremises. To facilitate this procedure, we propose Natural Program, a natural\nlanguage-based deductive reasoning format. Our approach enables models to\ngenerate precise reasoning steps where subsequent steps are more rigorously\ngrounded on prior steps. It also empowers language models to carry out\nreasoning self-verification in a step-by-step manner. By integrating this\nverification process into each deductive reasoning stage, we significantly\nenhance the rigor and trustfulness of generated reasoning steps. Along this\nprocess, we also improve the answer correctness on complex reasoning tasks.\nCode will be released at https://github.com/lz1oceani/verify_cot.\n","authors":["Zhan Ling","Yunhao Fang","Xuanlin Li","Zhiao Huang","Mingu Lee","Roland Memisevic","Hao Su"],"pdf_url":"https://arxiv.org/pdf/2306.03872v3.pdf","comment":"Published at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2305.11244v2","updated":"2023-10-03T19:45:25Z","published":"2023-05-18T18:15:53Z","title":"A Parameter-Efficient Learning Approach to Arabic Dialect Identification\n with Pre-Trained General-Purpose Speech Model","summary":" In this work, we explore Parameter-Efficient-Learning (PEL) techniques to\nrepurpose a General-Purpose-Speech (GSM) model for Arabic dialect\nidentification (ADI). Specifically, we investigate different setups to\nincorporate trainable features into a multi-layer encoder-decoder GSM\nformulation under frozen pre-trained settings. Our architecture includes\nresidual adapter and model reprogramming (input-prompting). We design a\ntoken-level label mapping to condition the GSM for Arabic Dialect\nIdentification (ADI). This is challenging due to the high variation in\nvocabulary and pronunciation among the numerous regional dialects. We achieve\nnew state-of-the-art accuracy on the ADI-17 dataset by vanilla fine-tuning. 
We\nfurther reduce the training budgets with the PEL method, which performs within\n1.86% accuracy to fine-tuning using only 2.5% of (extra) network trainable\nparameters. Our study demonstrates how to identify Arabic dialects using a\nsmall dataset and limited computation with open source code and pre-trained\nmodels.\n","authors":["Srijith Radhakrishnan","Chao-Han Huck Yang","Sumeer Ahmad Khan","Narsis A. Kiani","David Gomez-Cabrero","Jesper N. Tegner"],"pdf_url":"https://arxiv.org/pdf/2305.11244v2.pdf","comment":"Accepted to Interspeech 2023, 5 pages. Code is available at:\n https://github.com/Srijith-rkr/KAUST-Whisper-Adapter under MIT license"},{"id":"http://arxiv.org/abs/2302.01313v6","updated":"2023-10-03T19:31:48Z","published":"2023-02-02T18:39:30Z","title":"Double Equivariance for Inductive Link Prediction for Both New Nodes and\n New Relation Types","summary":" The task of inductive link prediction in discrete attributed multigraphs\n(e.g., knowledge graphs, multilayer networks, heterogeneous networks, etc.)\ngenerally focuses on test predictions with solely new nodes but not both new\nnodes and new relation types. In this work, we formally define the task of\npredicting (completely) new nodes and new relation types in test as a doubly\ninductive link prediction task and introduce a theoretical framework for the\nsolution. We start by defining the concept of double permutation-equivariant\nrepresentations that are equivariant to permutations of both node identities\nand edge relation types. We then propose a general blueprint to design neural\narchitectures that impose a structural representation of relations that can\ninductively generalize from training nodes and relations to arbitrarily new\ntest nodes and relations without the need for adaptation, side information, or\nretraining. We also introduce the concept of distributionally double\nequivariant positional embeddings designed to perform the same task. Finally,\nwe empirically demonstrate the capability of the two proposed models on a set\nof novel real-world benchmarks, showcasing average relative performance gains\nof $39.65\\%$ on predicting new relations types compared to baselines.\n","authors":["Jianfei Gao","Yangze Zhou","Jincheng Zhou","Bruno Ribeiro"],"pdf_url":"https://arxiv.org/pdf/2302.01313v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02382v1","updated":"2023-10-03T19:05:32Z","published":"2023-10-03T19:05:32Z","title":"Unsupervised Speech Recognition with N-Skipgram and Positional Unigram\n Matching","summary":" Training unsupervised speech recognition systems presents challenges due to\nGAN-associated instability, misalignment between speech and text, and\nsignificant memory demands. To tackle these challenges, we introduce a novel\nASR system, ESPUM. This system harnesses the power of lower-order N-skipgrams\n(up to N=3) combined with positional unigram statistics gathered from a small\nbatch of samples. Evaluated on the TIMIT benchmark, our model showcases\ncompetitive performance in ASR and phoneme segmentation tasks. Access our\npublicly available code at https://github.com/lwang114/GraphUnsupASR.\n","authors":["Liming Wang","Mark Hasegawa-Johnson","Chang D. 
Yoo"],"pdf_url":"https://arxiv.org/pdf/2310.02382v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02374v1","updated":"2023-10-03T18:54:10Z","published":"2023-10-03T18:54:10Z","title":"Conversational Health Agents: A Personalized LLM-Powered Agent Framework","summary":" Conversational Health Agents (CHAs) are interactive systems designed to\nenhance personal healthcare services by engaging in empathetic conversations\nand processing multimodal data. While current CHAs, especially those utilizing\nLarge Language Models (LLMs), primarily focus on conversation, they often lack\ncomprehensive agent capabilities. This includes the ability to access personal\nuser health data from wearables, 24/7 data collection sources, and electronic\nhealth records, as well as integrating the latest published health insights and\nconnecting with established multimodal data analysis tools. We are developing a\nframework to empower CHAs by equipping them with critical thinking, knowledge\nacquisition, and problem-solving abilities. Our CHA platform, powered by LLMs,\nseamlessly integrates healthcare tools, enables multilingual and multimodal\nconversations, and interfaces with a variety of user data analysis tools. We\nillustrate its proficiency in handling complex healthcare tasks, such as stress\nlevel estimation, showcasing the agent's cognitive and operational\ncapabilities.\n","authors":["Mahyar Abbasian","Iman Azimi","Amir M. Rahmani","Ramesh Jain"],"pdf_url":"https://arxiv.org/pdf/2310.02374v1.pdf","comment":"22 pages, 5 figures, journal paper"},{"id":"http://arxiv.org/abs/2310.02372v1","updated":"2023-10-03T18:52:19Z","published":"2023-10-03T18:52:19Z","title":"ProtoNER: Few shot Incremental Learning for Named Entity Recognition\n using Prototypical Networks","summary":" Key value pair (KVP) extraction or Named Entity Recognition(NER) from\nvisually rich documents has been an active area of research in document\nunderstanding and data extraction domain. Several transformer based models such\nas LayoutLMv2, LayoutLMv3, and LiLT have emerged achieving state of the art\nresults. However, addition of even a single new class to the existing model\nrequires (a) re-annotation of entire training dataset to include this new class\nand (b) retraining the model again. Both of these issues really slow down the\ndeployment of updated model. \\\\ We present \\textbf{ProtoNER}: Prototypical\nNetwork based end-to-end KVP extraction model that allows addition of new\nclasses to an existing model while requiring minimal number of newly annotated\ntraining samples. 
The key contributions of our model are: (1) No dependency on\ndataset used for initial training of the model, which alleviates the need to\nretain original training dataset for longer duration as well as data\nre-annotation which is very time consuming task, (2) No intermediate synthetic\ndata generation which tends to add noise and results in model's performance\ndegradation, and (3) Hybrid loss function which allows model to retain\nknowledge about older classes as well as learn about newly added classes.\\\\\nExperimental results show that ProtoNER finetuned with just 30 samples is able\nto achieve similar results for the newly added classes as that of regular model\nfinetuned with 2600 samples.\n","authors":["Ritesh Kumar","Saurabh Goyal","Ashish Verma","Vatche Isahagian"],"pdf_url":"https://arxiv.org/pdf/2310.02372v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02357v1","updated":"2023-10-03T18:32:34Z","published":"2023-10-03T18:32:34Z","title":"On the definition of toxicity in NLP","summary":" The fundamental problem in toxicity detection task lies in the fact that the\ntoxicity is ill-defined. Jigsaw, a unit within Google and one of the leaders in\nthe field, uses a definition of toxicity given by Dixon et al. - 'rude,\ndisrespectful, or unreasonable language that is likely to make someone leave a\ndiscussion'. One can instantly see the issue with this definition, as it gives\nno quantitative measure of the toxicity and operates with highly subjective\ncultural terms. Despite all vagueness and flaws, this definition is de-facto\nwidely used by many researchers. In this work we suggest quantative\nstress-based defenition for the toxicity that overcomes existing shortcomings.\n","authors":["Sergey Berezin","Reza Farahbakhsh","Noel Crespi"],"pdf_url":"https://arxiv.org/pdf/2310.02357v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.00978v2","updated":"2023-10-03T18:20:01Z","published":"2023-06-01T17:59:10Z","title":"AWQ: Activation-aware Weight Quantization for LLM Compression and\n Acceleration","summary":" Large language models (LLMs) have shown excellent performance on various\ntasks, but the astronomical model size raises the hardware barrier for serving\n(memory size) and slows down token generation (memory bandwidth). In this\npaper, we propose Activation-aware Weight Quantization (AWQ), a\nhardware-friendly approach for LLM low-bit weight-only quantization. Our method\nis based on the observation that weights are not equally important: protecting\nonly 1% of salient weights can greatly reduce quantization error. We then\npropose to search for the optimal per-channel scaling that protects the salient\nweights by observing the activation, not weights. AWQ does not rely on any\nbackpropagation or reconstruction, so it can well preserve LLMs' generalization\nability on different domains and modalities, without overfitting to the\ncalibration set. AWQ outperforms existing work on various language modeling and\ndomain-specific benchmarks. Thanks to better generalization, it achieves\nexcellent quantization performance for instruction-tuned LMs and, for the first\ntime, multi-modal LMs. Alongside AWQ, we implement an efficient and flexible\ninference framework tailored for LLMs on the edge, offering more than 3x\nspeedup over the Huggingface FP16 implementation on both desktop and mobile\nGPUs. 
It also democratizes the deployment of the 70B Llama-2 model on mobile\nGPU (NVIDIA Jetson Orin 64GB).\n","authors":["Ji Lin","Jiaming Tang","Haotian Tang","Shang Yang","Xingyu Dang","Chuang Gan","Song Han"],"pdf_url":"https://arxiv.org/pdf/2306.00978v2.pdf","comment":"Code available at: https://github.com/mit-han-lab/llm-awq"},{"id":"http://arxiv.org/abs/2310.02304v1","updated":"2023-10-03T17:59:32Z","published":"2023-10-03T17:59:32Z","title":"Self-Taught Optimizer (STOP): Recursively Self-Improving Code Generation","summary":" Several recent advances in AI systems (e.g., Tree-of-Thoughts and\nProgram-Aided Language Models) solve problems by providing a \"scaffolding\"\nprogram that structures multiple calls to language models to generate better\noutputs. A scaffolding program is written in a programming language such as\nPython. In this work, we use a language-model-infused scaffolding program to\nimprove itself. We start with a seed \"improver\" that improves an input program\naccording to a given utility function by querying a language model several\ntimes and returning the best solution. We then run this seed improver to\nimprove itself. Across a small set of downstream tasks, the resulting improved\nimprover generates programs with significantly better performance than its seed\nimprover. Afterward, we analyze the variety of self-improvement strategies\nproposed by the language model, including beam search, genetic algorithms, and\nsimulated annealing. Since the language models themselves are not altered, this\nis not full recursive self-improvement. Nonetheless, it demonstrates that a\nmodern language model, GPT-4 in our proof-of-concept experiments, is capable of\nwriting code that can call itself to improve itself. We critically consider\nconcerns around the development of self-improving technologies and evaluate the\nfrequency with which the generated code bypasses a sandbox.\n","authors":["Eric Zelikman","Eliana Lorch","Lester Mackey","Adam Tauman Kalai"],"pdf_url":"https://arxiv.org/pdf/2310.02304v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2310.02265v1","updated":"2023-10-03T17:59:58Z","published":"2023-10-03T17:59:58Z","title":"DREAM: Visual Decoding from Reversing Human Visual System","summary":" In this work we present DREAM, an fMRI-to-image method for reconstructing\nviewed images from brain activities, grounded on fundamental knowledge of the\nhuman visual system. We craft reverse pathways that emulate the hierarchical\nand parallel nature of how humans perceive the visual world. These tailored\npathways are specialized to decipher semantics, color, and depth cues from fMRI\ndata, mirroring the forward pathways from visual stimuli to fMRI recordings. To\ndo so, two components mimic the inverse processes within the human visual\nsystem: the Reverse Visual Association Cortex (R-VAC) which reverses pathways\nof this brain region, extracting semantics from fMRI data; the Reverse Parallel\nPKM (R-PKM) component simultaneously predicting color and depth from fMRI\nsignals. The experiments indicate that our method outperforms the current\nstate-of-the-art models in terms of the consistency of appearance, structure,\nand semantics. 
Code will be made publicly available to facilitate further\nresearch in this field.\n","authors":["Weihao Xia","Raoul de Charette","Cengiz Öztireli","Jing-Hao Xue"],"pdf_url":"https://arxiv.org/pdf/2310.02265v1.pdf","comment":"Project Page: https://weihaox.github.io/DREAM"},{"id":"http://arxiv.org/abs/2310.02264v1","updated":"2023-10-03T17:59:46Z","published":"2023-10-03T17:59:46Z","title":"Generalizable Long-Horizon Manipulations with Large Language Models","summary":" This work introduces a framework harnessing the capabilities of Large\nLanguage Models (LLMs) to generate primitive task conditions for generalizable\nlong-horizon manipulations with novel objects and unseen tasks. These task\nconditions serve as guides for the generation and adjustment of Dynamic\nMovement Primitives (DMP) trajectories for long-horizon task execution. We\nfurther create a challenging robotic manipulation task suite based on Pybullet\nfor long-horizon task evaluation. Extensive experiments in both simulated and\nreal-world environments demonstrate the effectiveness of our framework on both\nfamiliar tasks involving new objects and novel but related tasks, highlighting\nthe potential of LLMs in enhancing robotic system versatility and adaptability.\nProject website: https://object814.github.io/Task-Condition-With-LLM/\n","authors":["Haoyu Zhou","Mingyu Ding","Weikun Peng","Masayoshi Tomizuka","Lin Shao","Chuang Gan"],"pdf_url":"https://arxiv.org/pdf/2310.02264v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02262v1","updated":"2023-10-03T17:59:32Z","published":"2023-10-03T17:59:32Z","title":"RSRD: A Road Surface Reconstruction Dataset and Benchmark for Safe and\n Comfortable Autonomous Driving","summary":" This paper addresses the growing demands for safety and comfort in\nintelligent robot systems, particularly autonomous vehicles, where road\nconditions play a pivotal role in overall driving performance. For example,\nreconstructing road surfaces helps to enhance the analysis and prediction of\nvehicle responses for motion planning and control systems. We introduce the\nRoad Surface Reconstruction Dataset (RSRD), a real-world, high-resolution, and\nhigh-precision dataset collected with a specialized platform in diverse driving\nconditions. It covers common road types containing approximately 16,000 pairs\nof stereo images, original point clouds, and ground-truth depth/disparity maps,\nwith accurate post-processing pipelines to ensure its quality. Based on RSRD,\nwe further build a comprehensive benchmark for recovering road profiles through\ndepth estimation and stereo matching. Preliminary evaluations with various\nstate-of-the-art methods reveal the effectiveness of our dataset and the\nchallenge of the task, underscoring substantial opportunities of RSRD as a\nvaluable resource for advancing techniques, e.g., multi-view stereo towards\nsafe autonomous driving. The dataset and demo videos are available at\nhttps://thu-rsxd.com/rsrd/\n","authors":["Tong Zhao","Chenfeng Xu","Mingyu Ding","Masayoshi Tomizuka","Wei Zhan","Yintao Wei"],"pdf_url":"https://arxiv.org/pdf/2310.02262v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02260v1","updated":"2023-10-03T17:59:05Z","published":"2023-10-03T17:59:05Z","title":"TransRadar: Adaptive-Directional Transformer for Real-Time Multi-View\n Radar Semantic Segmentation","summary":" Scene understanding plays an essential role in enabling autonomous driving\nand maintaining high standards of performance and safety. 
To address this task,\ncameras and laser scanners (LiDARs) have been the most commonly used sensors,\nwith radars being less popular. Despite that, radars remain low-cost,\ninformation-dense, and fast-sensing techniques that are resistant to adverse\nweather conditions. While multiple works have been previously presented for\nradar-based scene semantic segmentation, the nature of the radar data still\nposes a challenge due to the inherent noise and sparsity, as well as the\ndisproportionate foreground and background. In this work, we propose a novel\napproach to the semantic segmentation of radar scenes using a multi-input\nfusion of radar data through a novel architecture and loss functions that are\ntailored to tackle the drawbacks of radar perception. Our novel architecture\nincludes an efficient attention block that adaptively captures important\nfeature information. Our method, TransRadar, outperforms state-of-the-art\nmethods on the CARRADA and RADIal datasets while having smaller model sizes.\nhttps://github.com/YahiDar/TransRadar\n","authors":["Yahia Dalbah","Jean Lahoud","Hisham Cholakkal"],"pdf_url":"https://arxiv.org/pdf/2310.02260v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02255v1","updated":"2023-10-03T17:57:24Z","published":"2023-10-03T17:57:24Z","title":"MathVista: Evaluating Mathematical Reasoning of Foundation Models in\n Visual Contexts","summary":" Although Large Language Models (LLMs) and Large Multimodal Models (LMMs)\nexhibit impressive skills in various domains, their ability for mathematical\nreasoning within visual contexts has not been formally examined. Equipping LLMs\nand LMMs with this capability is vital for general-purpose AI assistants and\nshowcases promising potential in education, data analysis, and scientific\ndiscovery. To bridge this gap, we present MathVista, a benchmark designed to\namalgamate challenges from diverse mathematical and visual tasks. We first\ntaxonomize the key task types, reasoning skills, and visual contexts from the\nliterature to guide our selection from 28 existing math-focused and visual\nquestion answering datasets. Then, we construct three new datasets, IQTest,\nFunctionQA, and PaperQA, to accommodate for missing types of visual contexts.\nThe problems featured often require deep visual understanding beyond OCR or\nimage captioning, and compositional reasoning with rich domain-specific tools,\nthus posing a notable challenge to existing models. We conduct a comprehensive\nevaluation of 11 prominent open-source and proprietary foundation models (LLMs,\nLLMs augmented with tools, and LMMs), and early experiments with GPT-4V. The\nbest-performing model, Multimodal Bard, achieves only 58% of human performance\n(34.8% vs 60.3%), indicating ample room for further improvement. Given this\nsignificant gap, MathVista fuels future research in the development of\ngeneral-purpose AI agents capable of tackling mathematically intensive and\nvisually rich real-world tasks. Preliminary tests show that MathVista also\npresents challenges to GPT-4V, underscoring the benchmark's importance. The\nproject is available at https://mathvista.github.io/.\n","authors":["Pan Lu","Hritik Bansal","Tony Xia","Jiacheng Liu","Chunyuan Li","Hannaneh Hajishirzi","Hao Cheng","Kai-Wei Chang","Michel Galley","Jianfeng Gao"],"pdf_url":"https://arxiv.org/pdf/2310.02255v1.pdf","comment":"51 pages, 56 figures. 
Work in progress"},{"id":"http://arxiv.org/abs/2310.02251v1","updated":"2023-10-03T17:53:51Z","published":"2023-10-03T17:53:51Z","title":"Talk2BEV: Language-enhanced Bird's-eye View Maps for Autonomous Driving","summary":" Talk2BEV is a large vision-language model (LVLM) interface for bird's-eye\nview (BEV) maps in autonomous driving contexts. While existing perception\nsystems for autonomous driving scenarios have largely focused on a pre-defined\n(closed) set of object categories and driving scenarios, Talk2BEV blends recent\nadvances in general-purpose language and vision models with BEV-structured map\nrepresentations, eliminating the need for task-specific models. This enables a\nsingle system to cater to a variety of autonomous driving tasks encompassing\nvisual and spatial reasoning, predicting the intents of traffic actors, and\ndecision-making based on visual cues. We extensively evaluate Talk2BEV on a\nlarge number of scene understanding tasks that rely on both the ability to\ninterpret free-form natural language queries, and in grounding these queries to\nthe visual context embedded into the language-enhanced BEV map. To enable\nfurther research in LVLMs for autonomous driving scenarios, we develop and\nrelease Talk2BEV-Bench, a benchmark encompassing 1000 human-annotated BEV\nscenarios, with more than 20,000 questions and ground-truth responses from the\nNuScenes dataset.\n","authors":["Vikrant Dewangan","Tushar Choudhary","Shivam Chandhok","Shubham Priyadarshan","Anushka Jain","Arun K. Singh","Siddharth Srivastava","Krishna Murthy Jatavallabhula","K. Madhava Krishna"],"pdf_url":"https://arxiv.org/pdf/2310.02251v1.pdf","comment":"Submitted to ICRA 2024. Project page at\n https://llmbev.github.io/talk2bev/"},{"id":"http://arxiv.org/abs/2310.02242v1","updated":"2023-10-03T17:50:23Z","published":"2023-10-03T17:50:23Z","title":"Hierarchical Generation of Human-Object Interactions with Diffusion\n Probabilistic Models","summary":" This paper presents a novel approach to generating the 3D motion of a human\ninteracting with a target object, with a focus on solving the challenge of\nsynthesizing long-range and diverse motions, which could not be fulfilled by\nexisting auto-regressive models or path planning-based methods. We propose a\nhierarchical generation framework to solve this challenge. Specifically, our\nframework first generates a set of milestones and then synthesizes the motion\nalong them. Therefore, the long-range motion generation could be reduced to\nsynthesizing several short motion sequences guided by milestones. The\nexperiments on the NSM, COUCH, and SAMP datasets show that our approach\noutperforms previous methods by a large margin in both quality and diversity.\nThe source code is available on our project page\nhttps://zju3dv.github.io/hghoi.\n","authors":["Huaijin Pi","Sida Peng","Minghui Yang","Xiaowei Zhou","Hujun Bao"],"pdf_url":"https://arxiv.org/pdf/2310.02242v1.pdf","comment":"ICCV 2023. Project page: https://zju3dv.github.io/hghoi"},{"id":"http://arxiv.org/abs/2310.02239v1","updated":"2023-10-03T17:49:04Z","published":"2023-10-03T17:49:04Z","title":"MiniGPT-5: Interleaved Vision-and-Language Generation via Generative\n Vokens","summary":" Large Language Models (LLMs) have garnered significant attention for their\nadvancements in natural language processing, demonstrating unparalleled prowess\nin text comprehension and generation. Yet, the simultaneous generation of\nimages with coherent textual narratives remains an evolving frontier. 
In\nresponse, we introduce an innovative interleaved vision-and-language generation\ntechnique anchored by the concept of \"generative vokens,\" acting as the bridge\nfor harmonized image-text outputs. Our approach is characterized by a\ndistinctive two-staged training strategy focusing on description-free\nmultimodal generation, where the training requires no comprehensive\ndescriptions of images. To bolster model integrity, classifier-free guidance is\nincorporated, enhancing the effectiveness of vokens on image generation. Our\nmodel, MiniGPT-5, exhibits substantial improvement over the baseline Divter\nmodel on the MMDialog dataset and consistently delivers superior or comparable\nmultimodal outputs in human evaluations on the VIST dataset, highlighting its\nefficacy across diverse benchmarks.\n","authors":["Kaizhi Zheng","Xuehai He","Xin Eric Wang"],"pdf_url":"https://arxiv.org/pdf/2310.02239v1.pdf","comment":"20 pages, 9 figures"},{"id":"http://arxiv.org/abs/2309.12955v2","updated":"2023-10-03T17:48:40Z","published":"2023-09-22T15:54:04Z","title":"On Data Fabrication in Collaborative Vehicular Perception: Attacks and\n Countermeasures","summary":" Collaborative perception, which greatly enhances the sensing capability of\nconnected and autonomous vehicles (CAVs) by incorporating data from external\nresources, also brings forth potential security risks. CAVs' driving decisions\nrely on remote untrusted data, making them susceptible to attacks carried out\nby malicious participants in the collaborative perception system. However,\nsecurity analysis and countermeasures for such threats are absent. To\nunderstand the impact of the vulnerability, we break the ground by proposing\nvarious real-time data fabrication attacks in which the attacker delivers\ncrafted malicious data to victims in order to perturb their perception results,\nleading to hard brakes or increased collision risks. Our attacks demonstrate a\nhigh success rate of over 86% on high-fidelity simulated scenarios and are\nrealizable in real-world experiments. To mitigate the vulnerability, we present\na systematic anomaly detection approach that enables benign vehicles to jointly\nreveal malicious fabrication. It detects 91.5% of attacks with a false positive\nrate of 3% in simulated scenarios and significantly mitigates attack impacts in\nreal-world scenarios.\n","authors":["Qingzhao Zhang","Shuowei Jin","Ruiyang Zhu","Jiachen Sun","Xumiao Zhang","Qi Alfred Chen","Z. Morley Mao"],"pdf_url":"https://arxiv.org/pdf/2309.12955v2.pdf","comment":"18 pages, 24 figures, accepted by Usenix Security 2024"},{"id":"http://arxiv.org/abs/2310.02237v1","updated":"2023-10-03T17:47:25Z","published":"2023-10-03T17:47:25Z","title":"Exploring Model Learning Heterogeneity for Boosting Ensemble Robustness","summary":" Deep neural network ensembles hold the potential of improving generalization\nperformance for complex learning tasks. This paper presents formal analysis and\nempirical evaluation to show that heterogeneous deep ensembles with high\nensemble diversity can effectively leverage model learning heterogeneity to\nboost ensemble robustness. We first show that heterogeneous DNN models trained\nfor solving the same learning problem, e.g., object detection, can\nsignificantly strengthen the mean average precision (mAP) through our weighted\nbounding box ensemble consensus method. 
Second, we further compose ensembles of\nheterogeneous models for solving different learning problems, e.g., object\ndetection and semantic segmentation, by introducing the connected component\nlabeling (CCL) based alignment. We show that this two-tier heterogeneity driven\nensemble construction method can compose an ensemble team that promotes high\nensemble diversity and low negative correlation among member models of the\nensemble, strengthening ensemble robustness against both negative examples and\nadversarial attacks. Third, we provide a formal analysis of the ensemble\nrobustness in terms of negative correlation. Extensive experiments validate the\nenhanced robustness of heterogeneous ensembles in both benign and adversarial\nsettings. The source codes are available on GitHub at\nhttps://github.com/git-disl/HeteRobust.\n","authors":["Yanzhao Wu","Ka-Ho Chow","Wenqi Wei","Ling Liu"],"pdf_url":"https://arxiv.org/pdf/2310.02237v1.pdf","comment":"Accepted by IEEE ICDM 2023"},{"id":"http://arxiv.org/abs/2310.02234v1","updated":"2023-10-03T17:43:24Z","published":"2023-10-03T17:43:24Z","title":"MIS-AVioDD: Modality Invariant and Specific Representation for\n Audio-Visual Deepfake Detection","summary":" Deepfakes are synthetic media generated using deep generative algorithms and\nhave posed a severe societal and political threat. Apart from facial\nmanipulation and synthetic voice, recently, a novel kind of deepfakes has\nemerged with either audio or visual modalities manipulated. In this regard, a\nnew generation of multimodal audio-visual deepfake detectors is being\ninvestigated to collectively focus on audio and visual data for multimodal\nmanipulation detection. Existing multimodal (audio-visual) deepfake detectors\nare often based on the fusion of the audio and visual streams from the video.\nExisting studies suggest that these multimodal detectors often obtain\nequivalent performances with unimodal audio and visual deepfake detectors. We\nconjecture that the heterogeneous nature of the audio and visual signals\ncreates distributional modality gaps and poses a significant challenge to\neffective fusion and efficient performance. In this paper, we tackle the\nproblem at the representation level to aid the fusion of audio and visual\nstreams for multimodal deepfake detection. Specifically, we propose the joint\nuse of modality (audio and visual) invariant and specific representations. This\nensures that the common patterns and patterns specific to each modality\nrepresenting pristine or fake content are preserved and fused for multimodal\ndeepfake manipulation detection. Our experimental results on FakeAVCeleb and\nKoDF audio-visual deepfake datasets suggest the enhanced accuracy of our\nproposed method over SOTA unimodal and multimodal audio-visual deepfake\ndetectors by $17.8$% and $18.4$%, respectively. Thus, obtaining\nstate-of-the-art performance.\n","authors":["Vinaya Sree Katamneni","Ajita Rattani"],"pdf_url":"https://arxiv.org/pdf/2310.02234v1.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2310.02230v1","updated":"2023-10-03T17:37:52Z","published":"2023-10-03T17:37:52Z","title":"Leveraging Diffusion Disentangled Representations to Mitigate Shortcuts\n in Underspecified Visual Tasks","summary":" Spurious correlations in the data, where multiple cues are predictive of the\ntarget labels, often lead to shortcut learning phenomena, where a model may\nrely on erroneous, easy-to-learn, cues while ignoring reliable ones. 
In this\nwork, we propose an ensemble diversification framework exploiting the\ngeneration of synthetic counterfactuals using Diffusion Probabilistic Models\n(DPMs). We discover that DPMs have the inherent capability to represent\nmultiple visual cues independently, even when they are largely correlated in\nthe training data. We leverage this characteristic to encourage model diversity\nand empirically show the efficacy of the approach with respect to several\ndiversification objectives. We show that diffusion-guided diversification can\nlead models to avert attention from shortcut cues, achieving ensemble diversity\nperformance comparable to previous methods requiring additional data\ncollection.\n","authors":["Luca Scimeca","Alexander Rubinstein","Armand Nicolicioiu","Damien Teney","Yoshua Bengio"],"pdf_url":"https://arxiv.org/pdf/2310.02230v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02219v1","updated":"2023-10-03T17:27:10Z","published":"2023-10-03T17:27:10Z","title":"What do we learn from a large-scale study of pre-trained visual\n representations in sim and real environments?","summary":" We present a large empirical investigation on the use of pre-trained visual\nrepresentations (PVRs) for training downstream policies that execute real-world\ntasks. Our study spans five different PVRs, two different policy-learning\nparadigms (imitation and reinforcement learning), and three different robots\nfor 5 distinct manipulation and indoor navigation tasks. From this effort, we\ncan arrive at three insights: 1) the performance trends of PVRs in the\nsimulation are generally indicative of their trends in the real world, 2) the\nuse of PVRs enables a first-of-its-kind result with indoor ImageNav (zero-shot\ntransfer to a held-out scene in the real world), and 3) the benefits from\nvariations in PVRs, primarily data-augmentation and fine-tuning, also transfer\nto the real-world performance. See project website for additional details and\nvisuals.\n","authors":["Sneha Silwal","Karmesh Yadav","Tingfan Wu","Jay Vakil","Arjun Majumdar","Sergio Arnaud","Claire Chen","Vincent-Pierre Berges","Dhruv Batra","Aravind Rajeswaran","Mrinal Kalakrishnan","Franziska Meier","Oleksandr Maksymets"],"pdf_url":"https://arxiv.org/pdf/2310.02219v1.pdf","comment":"Project website https://pvrs-sim2real.github.io/"},{"id":"http://arxiv.org/abs/2310.02201v1","updated":"2023-10-03T16:57:05Z","published":"2023-10-03T16:57:05Z","title":"Learnable Data Augmentation for One-Shot Unsupervised Domain Adaptation","summary":" This paper presents a classification framework based on learnable data\naugmentation to tackle the One-Shot Unsupervised Domain Adaptation (OS-UDA)\nproblem. OS-UDA is the most challenging setting in Domain Adaptation, as only\none single unlabeled target sample is assumed to be available for model\nadaptation. Driven by such single sample, our method LearnAug-UDA learns how to\naugment source data, making it perceptually similar to the target. As a result,\na classifier trained on such augmented data will generalize well for the target\ndomain. To achieve this, we designed an encoder-decoder architecture that\nexploits a perceptual loss and style transfer strategies to augment the source\ndata. Our method achieves state-of-the-art performance on two well-known Domain\nAdaptation benchmarks, DomainNet and VisDA. 
The project code is available at\nhttps://github.com/IIT-PAVIS/LearnAug-UDA\n","authors":["Julio Ivan Davila Carrazco","Pietro Morerio","Alessio Del Bue","Vittorio Murino"],"pdf_url":"https://arxiv.org/pdf/2310.02201v1.pdf","comment":"Accepted to The 34th British Machine Vision Conference (BMVC 2023)"},{"id":"http://arxiv.org/abs/2309.16783v2","updated":"2023-10-03T16:34:13Z","published":"2023-09-28T18:22:41Z","title":"Photonic Accelerators for Image Segmentation in Autonomous Driving and\n Defect Detection","summary":" Photonic computing promises faster and more energy-efficient deep neural\nnetwork (DNN) inference than traditional digital hardware. Advances in photonic\ncomputing can have profound impacts on applications such as autonomous driving\nand defect detection that depend on fast, accurate and energy efficient\nexecution of image segmentation models. In this paper, we investigate image\nsegmentation on photonic accelerators to explore: a) the types of image\nsegmentation DNN architectures that are best suited for photonic accelerators,\nand b) the throughput and energy efficiency of executing the different image\nsegmentation models on photonic accelerators, along with the trade-offs\ninvolved therein. Specifically, we demonstrate that certain segmentation models\nexhibit negligible loss in accuracy (compared to digital float32 models) when\nexecuted on photonic accelerators, and explore the empirical reasoning for\ntheir robustness. We also discuss techniques for recovering accuracy in the\ncase of models that do not perform well. Further, we compare throughput\n(inferences-per-second) and energy consumption estimates for different image\nsegmentation workloads on photonic accelerators. We discuss the challenges and\npotential optimizations that can help improve the application of photonic\naccelerators to such computer vision tasks.\n","authors":["Lakshmi Nair","David Widemann","Brad Turcott","Nick Moore","Alexandra Wleklinski","Darius Bunandar","Ioannis Papavasileiou","Shihu Wang","Eric Logan"],"pdf_url":"https://arxiv.org/pdf/2309.16783v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00527v2","updated":"2023-10-03T16:31:45Z","published":"2023-10-01T00:13:06Z","title":"Self-supervised Learning of Contextualized Local Visual Embeddings","summary":" We present Contextualized Local Visual Embeddings (CLoVE), a self-supervised\nconvolutional-based method that learns representations suited for dense\nprediction tasks. CLoVE deviates from current methods and optimizes a single\nloss function that operates at the level of contextualized local embeddings\nlearned from output feature maps of convolution neural network (CNN) encoders.\nTo learn contextualized embeddings, CLoVE proposes a normalized mult-head\nself-attention layer that combines local features from different parts of an\nimage based on similarity. We extensively benchmark CLoVE's pre-trained\nrepresentations on multiple datasets. CLoVE reaches state-of-the-art\nperformance for CNN-based architectures in 4 dense prediction downstream tasks,\nincluding object detection, instance segmentation, keypoint detection, and\ndense pose estimation. Code:\n$\\href{https://github.com/sthalles/CLoVE}{\\text{https://github.com/sthalles/CLoVE}}$.\n","authors":["Thalles Santos Silva","Helio Pedrini","Adín Ramírez Rivera"],"pdf_url":"https://arxiv.org/pdf/2310.00527v2.pdf","comment":"Pre-print. 4th Visual Inductive Priors for Data-Efficient Deep\n Learning Workshop ICCV 2023. 
Code at\n $\\href{https://github.com/sthalles/CLoVE}{\\text{this link}}$"},{"id":"http://arxiv.org/abs/2211.13976v4","updated":"2023-10-03T16:13:56Z","published":"2022-11-25T09:38:22Z","title":"Expanding Small-Scale Datasets with Guided Imagination","summary":" The power of DNNs relies heavily on the quantity and quality of training\ndata. However, collecting and annotating data on a large scale is often\nexpensive and time-consuming. To address this issue, we explore a new task,\ntermed dataset expansion, aimed at expanding a ready-to-use small dataset by\nautomatically creating new labeled samples. To this end, we present a Guided\nImagination Framework (GIF) that leverages cutting-edge generative models like\nDALL-E2 and Stable Diffusion (SD) to \"imagine\" and create informative new data\nfrom the input seed data. Specifically, GIF conducts data imagination by\noptimizing the latent features of the seed data in the semantically meaningful\nspace of the prior model, resulting in the creation of photo-realistic images\nwith new content. To guide the imagination towards creating informative samples\nfor model training, we introduce two key criteria, i.e., class-maintained\ninformation boosting and sample diversity promotion. These criteria are\nverified to be essential for effective dataset expansion: GIF-SD obtains 13.5%\nhigher model accuracy on natural image datasets than unguided expansion with\nSD. With these essential criteria, GIF successfully expands small datasets in\nvarious scenarios, boosting model accuracy by 36.9% on average over six natural\nimage datasets and by 13.5% on average over three medical datasets. The source\ncode is available at https://github.com/Vanint/DatasetExpansion.\n","authors":["Yifan Zhang","Daquan Zhou","Bryan Hooi","Kai Wang","Jiashi Feng"],"pdf_url":"https://arxiv.org/pdf/2211.13976v4.pdf","comment":"NeurIPS 2023. Source code: https://github.com/Vanint/DatasetExpansion"},{"id":"http://arxiv.org/abs/2306.11238v2","updated":"2023-10-03T15:50:45Z","published":"2023-06-20T02:21:45Z","title":"CAMP-Net: Consistency-Aware Multi-Prior Network for Accelerated MRI\n Reconstruction","summary":" Despite promising advances in deep learning-based MRI reconstruction methods,\nrestoring high-frequency image details and textures remains a challenging\nproblem for accelerated MRI. To tackle this challenge, we propose a novel\nconsistency-aware multi-prior network (CAMP-Net) for MRI reconstruction.\nCAMP-Net leverages the complementary nature of multiple prior knowledge and\nexplores data redundancy between adjacent slices in the hybrid domain to\nimprove image quality. It incorporates three interleaved modules respectively\nfor image enhancement, k-space restoration, and calibration consistency to\njointly learn consistency-aware multiple priors in an end-to-end fashion. The\nimage enhancement module learns a coil-combined image prior to suppress\nnoise-like artifacts, while the k-space restoration module explores multi-coil\nk-space correlations to recover high-frequency details. The calibration\nconsistency module embeds the known physical properties of MRI acquisition to\nensure consistency of k-space correlations extracted from measurements and the\nartifact-free image intermediate. The resulting low- and high-frequency\nreconstructions are hierarchically aggregated in a frequency fusion module and\niteratively refined to progressively reconstruct the final image. 
We evaluated\nthe generalizability and robustness of our method on three large public\ndatasets with various accelerations and sampling patterns. Comprehensive\nexperiments demonstrate that CAMP-Net outperforms state-of-the-art methods in\nterms of reconstruction quality and quantitative $T_2$ mapping.\n","authors":["Liping Zhang","Xiaobo Li","Weitian Chen"],"pdf_url":"https://arxiv.org/pdf/2306.11238v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02140v1","updated":"2023-10-03T15:24:15Z","published":"2023-10-03T15:24:15Z","title":"PAD-Phys: Exploiting Physiology for Presentation Attack Detection in\n Face Biometrics","summary":" Presentation Attack Detection (PAD) is a crucial stage in facial recognition\nsystems to avoid leakage of personal information or spoofing of identity to\nentities. Recently, pulse detection based on remote photoplethysmography (rPPG)\nhas been shown to be effective in face presentation attack detection.\n This work presents three different approaches to the presentation attack\ndetection based on rPPG: (i) The physiological domain, a domain using\nrPPG-based models, (ii) the Deepfakes domain, a domain where models were\nretrained from the physiological domain to specific Deepfakes detection tasks;\nand (iii) a new Presentation Attack domain was trained by applying transfer\nlearning from the two previous domains to improve the capability to\ndifferentiate between bona-fides and attacks.\n The results show the efficiency of the rPPG-based models for presentation\nattack detection, evidencing a 21.70% decrease in average classification error\nrate (ACER) (from 41.03% to 19.32%) when the presentation attack domain is\ncompared to the physiological and Deepfakes domains. Our experiments highlight\nthe efficiency of transfer learning in rPPG-based models and perform well in\npresentation attack detection in instruments that do not allow copying of this\nphysiological feature.\n","authors":["Luis F. Gomez","Julian Fierrez","Aythami Morales","Mahdi Ghafourian","Ruben Tolosana","Imanol Solano","Alejandro Garcia","Francisco Zamora-Martinez"],"pdf_url":"https://arxiv.org/pdf/2310.02140v1.pdf","comment":"Preprint of the paper presented to the Workshop on IEEE 47th Annual\n Computers, Software, and Applications Conference (COMPSAC, 2023)"},{"id":"http://arxiv.org/abs/2310.02129v1","updated":"2023-10-03T15:10:46Z","published":"2023-10-03T15:10:46Z","title":"Unveiling the Pitfalls of Knowledge Editing for Large Language Models","summary":" As the cost associated with fine-tuning Large Language Models (LLMs)\ncontinues to rise, recent research efforts have pivoted towards developing\nmethodologies to edit implicit knowledge embedded within LLMs. Yet, there's\nstill a dark cloud lingering overhead -- will knowledge editing trigger\nbutterfly effect? since it is still unclear whether knowledge editing might\nintroduce side effects that pose potential risks or not. This paper pioneers\nthe investigation into the potential pitfalls associated with knowledge editing\nfor LLMs. To achieve this, we introduce new benchmark datasets and propose\ninnovative evaluation metrics. Our results underline two pivotal concerns: (1)\nKnowledge Conflict: Editing groups of facts that logically clash can magnify\nthe inherent inconsistencies in LLMs-a facet neglected by previous methods. 
(2)\nKnowledge Distortion: Altering parameters with the aim of editing factual\nknowledge can irrevocably warp the innate knowledge structure of LLMs.\nExperimental results vividly demonstrate that knowledge editing might\ninadvertently cast a shadow of unintended consequences on LLMs, which warrant\nattention and efforts for future works. Code will be released at\nhttps://github.com/zjunlp/PitfallsKnowledgeEditing.\n","authors":["Zhoubo Li","Ningyu Zhang","Yunzhi Yao","Mengru Wang","Xi Chen","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2310.02129v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2310.01259v2","updated":"2023-10-03T15:08:22Z","published":"2023-10-02T14:51:10Z","title":"Faster and Accurate Neural Networks with Semantic Inference","summary":" Deep neural networks (DNN) usually come with a significant computational\nburden. While approaches such as structured pruning and mobile-specific DNNs\nhave been proposed, they incur drastic accuracy loss. In this paper we leverage\nthe intrinsic redundancy in latent representations to reduce the computational\nload with limited loss in performance. We show that semantically similar inputs\nshare many filters, especially in the earlier layers. Thus, semantically\nsimilar classes can be clustered to create cluster-specific subgraphs. To this\nend, we propose a new framework called Semantic Inference (SINF). In short,\nSINF (i) identifies the semantic cluster the object belongs to using a small\nadditional classifier and (ii) executes the subgraph extracted from the base\nDNN related to that semantic cluster for inference. To extract each\ncluster-specific subgraph, we propose a new approach named Discriminative\nCapability Score (DCS) that finds the subgraph with the capability to\ndiscriminate among the members of a specific semantic cluster. DCS is\nindependent from SINF and can be applied to any DNN. We benchmark the\nperformance of DCS on the VGG16, VGG19, and ResNet50 DNNs trained on the\nCIFAR100 dataset against 6 state-of-the-art pruning approaches. Our results\nshow that (i) SINF reduces the inference time of VGG19, VGG16, and ResNet50\nrespectively by up to 35%, 29% and 15% with only 0.17%, 3.75%, and 6.75%\naccuracy loss (ii) DCS achieves respectively up to 3.65%, 4.25%, and 2.36%\nbetter accuracy with VGG16, VGG19, and ResNet50 with respect to existing\ndiscriminative scores (iii) when used as a pruning criterion, DCS achieves up\nto 8.13% accuracy gain with 5.82% less parameters than the existing state of\nthe art work published at ICLR 2023 (iv) when considering per-cluster accuracy,\nSINF performs on average 5.73%, 8.38% and 6.36% better than the base VGG16,\nVGG19, and ResNet50.\n","authors":["Sazzad Sayyed","Jonathan Ashdown","Francesco Restuccia"],"pdf_url":"https://arxiv.org/pdf/2310.01259v2.pdf","comment":"14 pages, 6 figures, conference format"},{"id":"http://arxiv.org/abs/2310.02110v1","updated":"2023-10-03T14:53:53Z","published":"2023-10-03T14:53:53Z","title":"SIEVE: Multimodal Dataset Pruning Using Image Captioning Models","summary":" Vision-Language Models (VLMs) are pretrained on large, diverse, and noisy\nweb-crawled datasets. This underscores the critical need for dataset pruning,\nas the quality of these datasets is strongly correlated with the performance of\nVLMs on downstream tasks. 
Using CLIPScore from a pretrained model to only train\nmodels using highly-aligned samples is one of the most successful methods for\npruning.We argue that this approach suffers from multiple limitations\nincluding: 1) false positives due to spurious correlations captured by the\npretrained CLIP model, 2) false negatives due to poor discrimination between\nhard and bad samples, and 3) biased ranking towards samples similar to the\npretrained CLIP dataset. We propose a pruning method, SIEVE, that employs\nsynthetic captions generated by image-captioning models pretrained on small,\ndiverse, and well-aligned image-text pairs to evaluate the alignment of noisy\nimage-text pairs. To bridge the gap between the limited diversity of generated\ncaptions and the high diversity of alternative text (alt-text), we estimate the\nsemantic textual similarity in the embedding space of a language model\npretrained on billions of sentences. Using DataComp, a multimodal dataset\nfiltering benchmark, we achieve state-of-the-art performance on the large scale\npool, and competitive results on the medium scale pool, surpassing\nCLIPScore-based filtering by 1.7% and 2.6% on average, on 38 downstream tasks.\n","authors":["Anas Mahmoud","Mostafa Elhoushi","Amro Abbas","Yu Yang","Newsha Ardalani","Hugh Leather","Ari Morcos"],"pdf_url":"https://arxiv.org/pdf/2310.02110v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2104.00253v4","updated":"2023-10-03T14:48:16Z","published":"2021-04-01T04:40:22Z","title":"Deep Contrastive Patch-Based Subspace Learning for Camera Image Signal\n Processing","summary":" Camera Image Signal Processing (ISP) pipelines can get appealing results in\ndifferent image signal processing tasks. Nonetheless, the majority of these\nmethods, including those employing an encoder-decoder deep architecture for the\ntask, typically utilize a uniform filter applied consistently across the entire\nimage. However, it is natural to view a camera image as heterogeneous, as the\ncolor intensity and the artificial noise are distributed vastly differently,\neven across the two-dimensional domain of a single image. Varied Moire ringing,\nmotion blur, color-bleaching, or lens-based projection distortions can all\npotentially lead to a heterogeneous image artifact filtering problem. In this\npaper, we present a specific patch-based, local subspace deep neural network\nthat improves Camera ISP to be robust to heterogeneous artifacts (especially\nimage denoising). We call our three-fold deep-trained model the Patch Subspace\nLearning Autoencoder (PSL-AE). The PSL-AE model does not make assumptions\nregarding uniform levels of image distortion. Instead, it first encodes patches\nextracted from noisy a nd clean image pairs, with different artifact types or\ndistortion levels, by contrastive learning. Then, the patches of each image are\nencoded into corresponding soft clusters within their suitable latent\nsub-space, utilizing a prior mixture model. Furthermore, the decoders undergo\ntraining in an unsupervised manner, specifically trained for the image patches\npresent in each cluster. 
The experiments highlight the adaptability and\nefficacy through enhanced heterogeneous filtering, both from synthesized\nartifacts but also realistic SIDD image pairs.\n","authors":["Yunhao Yang","Yi Wang","Chandrajit Bajaj"],"pdf_url":"https://arxiv.org/pdf/2104.00253v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02097v1","updated":"2023-10-03T14:41:30Z","published":"2023-10-03T14:41:30Z","title":"Leveraging Classic Deconvolution and Feature Extraction in Zero-Shot\n Image Restoration","summary":" Non-blind deconvolution aims to restore a sharp image from its blurred\ncounterpart given an obtained kernel. Existing deep neural architectures are\noften built based on large datasets of sharp ground truth images and trained\nwith supervision. Sharp, high quality ground truth images, however, are not\nalways available, especially for biomedical applications. This severely hampers\nthe applicability of current approaches in practice. In this paper, we propose\na novel non-blind deconvolution method that leverages the power of deep\nlearning and classic iterative deconvolution algorithms. Our approach combines\na pre-trained network to extract deep features from the input image with\niterative Richardson-Lucy deconvolution steps. Subsequently, a zero-shot\noptimisation process is employed to integrate the deconvolved features,\nresulting in a high-quality reconstructed image. By performing the preliminary\nreconstruction with the classic iterative deconvolution method, we can\neffectively utilise a smaller network to produce the final image, thus\naccelerating the reconstruction whilst reducing the demand for valuable\ncomputational resources. Our method demonstrates significant improvements in\nvarious real-world applications non-blind deconvolution tasks.\n","authors":["Tomáš Chobola","Gesine Müller","Veit Dausmann","Anton Theileis","Jan Taucher","Jan Huisken","Tingying Peng"],"pdf_url":"https://arxiv.org/pdf/2310.02097v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02083v1","updated":"2023-10-03T14:26:56Z","published":"2023-10-03T14:26:56Z","title":"Point Neighborhood Embeddings","summary":" Point convolution operations rely on different embedding mechanisms to encode\nthe neighborhood information of each point in order to detect patterns in 3D\nspace. However, as convolutions are usually evaluated as a whole, not much work\nhas been done to investigate which is the ideal mechanism to encode such\nneighborhood information. In this paper, we provide the first extensive study\nthat analyzes such Point Neighborhood Embeddings (PNE) alone in a controlled\nexperimental setup. From our experiments, we derive a set of recommendations\nfor PNE that can help to improve future designs of neural network architectures\nfor point clouds. Our most surprising finding shows that the most commonly used\nembedding based on a Multi-layer Perceptron (MLP) with ReLU activation\nfunctions provides the lowest performance among all embeddings, even being\nsurpassed on some tasks by a simple linear combination of the point\ncoordinates. Additionally, we show that a neural network architecture using\nsimple convolutions based on such embeddings is able to achieve\nstate-of-the-art results on several tasks, outperforming recent and more\ncomplex operations. 
Lastly, we show that these findings extrapolate to other\nmore complex convolution operations, where we show how following our\nrecommendations we are able to improve recent state-of-the-art architectures.\n","authors":["Pedro Hermosilla"],"pdf_url":"https://arxiv.org/pdf/2310.02083v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.08250v3","updated":"2023-10-03T14:22:29Z","published":"2023-03-14T21:52:27Z","title":"Transforming Transformers for Resilient Lifelong Learning","summary":" Lifelong learning without catastrophic forgetting (i.e., resiliency) remains\nan open problem for deep neural networks. The prior art mostly focuses on\nconvolutional neural networks. With the increasing dominance of Transformers in\ndeep learning, it is a pressing need to study lifelong learning with\nTransformers. Due to the complexity of training Transformers in practice, for\nlifelong learning, a question naturally arises: Can Transformers be learned to\ngrow in a task aware way, that is to be dynamically transformed by introducing\nlightweight learnable plastic components to the architecture, while retaining\nthe parameter-heavy, but stable components at streaming tasks? To that end,\nmotivated by the lifelong learning capability maintained by the functionality\nof Hippocampi in human brain, we explore what would be, and how to implement,\nArtificial Hippocampi (ArtiHippo) in Transformers. We present a method to\nidentify, and learn to grow, ArtiHippo in Vision Transformers (ViTs) for\nresilient lifelong learning in four aspects: (i) Where to place ArtiHippo to\nenable plasticity while preserving the core function of ViTs at streaming\ntasks? (ii) How to represent and realize ArtiHippo to ensure expressivity and\nadaptivity for tackling tasks of different nature in lifelong learning? (iii)\nHow to learn to grow ArtiHippo to exploit task synergies (i.e., the learned\nknowledge) and overcome catastrophic forgetting? (iv) How to harness the best\nof our proposed ArtiHippo and prompting-based approaches? In experiments, we\ntest the proposed method on the challenging Visual Domain Decathlon (VDD)\nbenchmark and the 5-Dataset benchmark under the task-incremental lifelong\nlearning setting. It obtains consistently better performance than the prior art\nwith sensible ArtiHippo learned continually. To our knowledge, it is the first\nattempt of lifelong learning with ViTs on the challenging VDD benchmark.\n","authors":["Chinmay Savadikar","Michelle Dai","Tianfu Wu"],"pdf_url":"https://arxiv.org/pdf/2303.08250v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02071v1","updated":"2023-10-03T14:13:36Z","published":"2023-10-03T14:13:36Z","title":"Towards End-to-End Embodied Decision Making via Multi-modal Large\n Language Model: Explorations with GPT4-Vision and Beyond","summary":" In this study, we explore the potential of Multimodal Large Language Models\n(MLLMs) in improving embodied decision-making processes for agents. While Large\nLanguage Models (LLMs) have been widely used due to their advanced reasoning\nskills and vast world knowledge, MLLMs like GPT4-Vision offer enhanced visual\nunderstanding and reasoning capabilities. We investigate whether\nstate-of-the-art MLLMs can handle embodied decision-making in an end-to-end\nmanner and whether collaborations between LLMs and MLLMs can enhance\ndecision-making. To address these questions, we introduce a new benchmark\ncalled PCA-EVAL, which evaluates embodied decision-making from the perspectives\nof Perception, Cognition, and Action. 
Additionally, we propose HOLMES, a\nmulti-agent cooperation framework that allows LLMs to leverage MLLMs and APIs\nto gather multimodal information for informed decision-making. We compare\nend-to-end embodied decision-making and HOLMES on our benchmark and find that\nthe GPT4-Vision model demonstrates strong end-to-end embodied decision-making\nabilities, outperforming GPT4-HOLMES in terms of average decision accuracy\n(+3%). However, this performance is exclusive to the latest GPT4-Vision model,\nsurpassing the open-source state-of-the-art MLLM by 26%. Our results indicate\nthat powerful MLLMs like GPT4-Vision hold promise for decision-making in\nembodied agents, offering new avenues for MLLM research.\n","authors":["Liang Chen","Yichi Zhang","Shuhuai Ren","Haozhe Zhao","Zefan Cai","Yuchi Wang","Tianyu Liu","Baobao Chang"],"pdf_url":"https://arxiv.org/pdf/2310.02071v1.pdf","comment":"18 pages, 10 figures"},{"id":"http://arxiv.org/abs/2309.16916v2","updated":"2023-10-03T14:10:02Z","published":"2023-09-29T01:07:38Z","title":"ONNXExplainer: an ONNX Based Generic Framework to Explain Neural\n Networks Using Shapley Values","summary":" Understanding why a neural network model makes certain decisions can be as\nimportant as the inference performance. Various methods have been proposed to\nhelp practitioners explain the prediction of a neural network model, of which\nShapley values are most popular. SHAP package is a leading implementation of\nShapley values to explain neural networks implemented in TensorFlow or PyTorch\nbut lacks cross-platform support, one-shot deployment and is highly\ninefficient. To address these problems, we present the ONNXExplainer, which is\na generic framework to explain neural networks using Shapley values in the ONNX\necosystem. In ONNXExplainer, we develop its own automatic differentiation and\noptimization approach, which not only enables One-Shot Deployment of neural\nnetworks inference and explanations, but also significantly improves the\nefficiency to compute explanation with less memory consumption. For fair\ncomparison purposes, we also implement the same optimization in TensorFlow and\nPyTorch and measure its performance against the current state of the art\nopen-source counterpart, SHAP. Extensive benchmarks demonstrate that the\nproposed optimization approach improves the explanation latency of VGG19,\nResNet50, DenseNet201, and EfficientNetB0 by as much as 500%.\n","authors":["Yong Zhao","Runxin He","Nicholas Kersting","Can Liu","Shubham Agrawal","Chiranjeet Chetia","Yu Gu"],"pdf_url":"https://arxiv.org/pdf/2309.16916v2.pdf","comment":"11 pages, 11 figures"},{"id":"http://arxiv.org/abs/2310.02067v1","updated":"2023-10-03T14:09:27Z","published":"2023-10-03T14:09:27Z","title":"Content Bias in Deep Learning Age Approximation: A new Approach Towards\n more Explainability","summary":" In the context of temporal image forensics, it is not evident that a neural\nnetwork, trained on images from different time-slots (classes), exploit solely\nage related features. Usually, images taken in close temporal proximity (e.g.,\nbelonging to the same age class) share some common content properties. Such\ncontent bias can be exploited by a neural network. In this work, a novel\napproach that evaluates the influence of image content is proposed. This\napproach is verified using synthetic images (where content bias can be ruled\nout) with an age signal embedded. 
Based on the proposed approach, it is shown\nthat a `standard' neural network trained in the context of age classification\nis strongly dependent on image content. As a potential countermeasure, two\ndifferent techniques are applied to mitigate the influence of the image content\nduring training, and they are also evaluated by the proposed method.\n","authors":["Robert Jöchl","Andreas Uhl"],"pdf_url":"https://arxiv.org/pdf/2310.02067v1.pdf","comment":"This is a preprint, the paper is currently under consideration at\n Pattern Recognition Letters"},{"id":"http://arxiv.org/abs/2310.02060v1","updated":"2023-10-03T14:03:20Z","published":"2023-10-03T14:03:20Z","title":"Global Attractor for a Reaction-Diffusion Model Arising in Biological\n Dynamic in 3D Soil Structure","summary":" Partial Differential Equations (PDEs) play a crucial role as tools for\nmodeling and comprehending intricate natural processes, notably within the\ndomain of biology. This research explores the domain of microbial activity\nwithin the complex matrix of 3D soil structures, providing valuable\nunderstanding into both the existence and uniqueness of solutions and the\nasymptotic behavior of the corresponding PDE model. Our investigation results\nin the discovery of a global attractor, a fundamental feature with significant\nimplications for long-term system behavior. To enhance the clarity of our\nfindings, numerical simulations are employed to visually illustrate the\nattributes of this global attractor.\n","authors":["Mohamed Elghandouri","Khalil Ezzinbi","Mouad Klai","Olivier Monga"],"pdf_url":"https://arxiv.org/pdf/2310.02060v1.pdf","comment":"Preprint submitted to Mathematical Geosciences"},{"id":"http://arxiv.org/abs/2310.02050v1","updated":"2023-10-03T13:43:50Z","published":"2023-10-03T13:43:50Z","title":"Tuning Large language model for End-to-end Speech Translation","summary":" With the emergence of large language models (LLMs), multimodal models based\non LLMs have demonstrated significant potential. Models such as LLaSM, X-LLM,\nand SpeechGPT exhibit an impressive ability to comprehend and generate human\ninstructions. However, their performance often falters when faced with complex\ntasks like end-to-end speech translation (E2E-ST), a cross-language and\ncross-modal translation task. In comparison to single-modal models, multimodal\nmodels lag behind in these scenarios. This paper introduces LST, a Large\nmultimodal model designed to excel at the E2E-ST task. LST consists of a speech\nfrontend, an adapter, and a LLM backend. The training of LST consists of two\nstages: (1) Modality adjustment, where the adapter is tuned to align speech\nrepresentation with text embedding space, and (2) Downstream task fine-tuning,\nwhere both the adapter and LLM model are trained to optimize performance on the\nE2EST task. Experimental results on the MuST-C speech translation benchmark\ndemonstrate that LST-13B achieves BLEU scores of 30.39/41.55/35.33 on\nEn-De/En-Fr/En-Es language pairs, surpassing previous models and establishing a\nnew state-of-the-art. Additionally, we conduct an in-depth analysis of\nsingle-modal model selection and the impact of training strategies, which lays\nthe foundation for future research. 
We will open up our code and models after\nreview.\n","authors":["Hao Zhang","Nianwen Si","Yaqi Chen","Wenlin Zhang","Xukui Yang","Dan Qu","Xiaolin Jiao"],"pdf_url":"https://arxiv.org/pdf/2310.02050v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02048v1","updated":"2023-10-03T13:41:13Z","published":"2023-10-03T13:41:13Z","title":"Exploring Generalisability of Self-Distillation with No Labels for\n SAR-Based Vegetation Prediction","summary":" In this work we pre-train a DINO-ViT based model using two Synthetic Aperture\nRadar datasets (S1GRD or GSSIC) across three regions (China, Conus, Europe). We\nfine-tune the models on smaller labeled datasets to predict vegetation\npercentage, and empirically study the connection between the embedding space of\nthe models and their ability to generalize across diverse geographic regions\nand to unseen data. For S1GRD, embedding spaces of different regions are\nclearly separated, while GSSIC's overlaps. Positional patterns remain during\nfine-tuning, and greater distances in embeddings often result in higher errors\nfor unfamiliar regions. With this, our work increases our understanding of\ngeneralizability for self-supervised models applied to remote sensing.\n","authors":["Laura Martínez-Ferrer","Anna Jungbluth","Joseph A. Gallego-Mejia","Matt Allen","Francisco Dorr","Freddie Kalaitzis","Raúl Ramos-Pollán"],"pdf_url":"https://arxiv.org/pdf/2310.02048v1.pdf","comment":"10 pages, 9 figures"},{"id":"http://arxiv.org/abs/2310.02044v1","updated":"2023-10-03T13:35:49Z","published":"2023-10-03T13:35:49Z","title":"Video Transformers under Occlusion: How Physics and Background\n Attributes Impact Large Models for Robotic Manipulation","summary":" As transformer architectures and dataset sizes continue to scale, the need to\nunderstand the specific dataset factors affecting model performance becomes\nincreasingly urgent. This paper investigates how object physics attributes\n(color, friction coefficient, shape) and background characteristics (static,\ndynamic, background complexity) influence the performance of Video Transformers\nin trajectory prediction tasks under occlusion. Beyond mere occlusion\nchallenges, this study aims to investigate three questions: How do object\nphysics attributes and background characteristics influence the model\nperformance? What kinds of attributes are most influential to the model\ngeneralization? Is there a data saturation point for large transformer model\nperformance within a single task? To facilitate this research, we present\nOccluManip, a real-world video-based robot pushing dataset comprising 460,000\nconsistent recordings of objects with different physics and varying\nbackgrounds. 1.4 TB and in total 1278 hours of high-quality videos of flexible\ntemporal length along with target object trajectories are collected,\naccommodating tasks with different temporal requirements. Additionally, we\npropose Video Occlusion Transformer (VOT), a generic video-transformer-based\nnetwork achieving an average 96% accuracy across all 18 sub-datasets provided\nin OccluManip. OccluManip and VOT will be released at:\nhttps://github.com/ShutongJIN/OccluManip.git\n","authors":["Shutong Jin","Ruiyu Wang","Muhammad Zahid","Florian T. 
Pokorny"],"pdf_url":"https://arxiv.org/pdf/2310.02044v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02037v1","updated":"2023-10-03T13:28:14Z","published":"2023-10-03T13:28:14Z","title":"An evaluation of pre-trained models for feature extraction in image\n classification","summary":" In recent years, we have witnessed a considerable increase in performance in\nimage classification tasks. This performance improvement is mainly due to the\nadoption of deep learning techniques. Generally, deep learning techniques\ndemand a large set of annotated data, making it a challenge when applying it to\nsmall datasets. In this scenario, transfer learning strategies have become a\npromising alternative to overcome these issues. This work aims to compare the\nperformance of different pre-trained neural networks for feature extraction in\nimage classification tasks. We evaluated 16 different pre-trained models in\nfour image datasets. Our results demonstrate that the best general performance\nalong the datasets was achieved by CLIP-ViT-B and ViT-H-14, where the\nCLIP-ResNet50 model had similar performance but with less variability.\nTherefore, our study provides evidence supporting the choice of models for\nfeature extraction in image classification tasks.\n","authors":["Erick da Silva Puls","Matheus V. Todescato","Joel L. Carbonera"],"pdf_url":"https://arxiv.org/pdf/2310.02037v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.16412v2","updated":"2023-10-03T13:18:29Z","published":"2023-07-31T05:38:17Z","title":"RCS-YOLO: A Fast and High-Accuracy Object Detector for Brain Tumor\n Detection","summary":" With an excellent balance between speed and accuracy, cutting-edge YOLO\nframeworks have become one of the most efficient algorithms for object\ndetection. However, the performance of using YOLO networks is scarcely\ninvestigated in brain tumor detection. We propose a novel YOLO architecture\nwith Reparameterized Convolution based on channel Shuffle (RCS-YOLO). We\npresent RCS and a One-Shot Aggregation of RCS (RCS-OSA), which link feature\ncascade and computation efficiency to extract richer information and reduce\ntime consumption. Experimental results on the brain tumor dataset Br35H show\nthat the proposed model surpasses YOLOv6, YOLOv7, and YOLOv8 in speed and\naccuracy. Notably, compared with YOLOv7, the precision of RCS-YOLO improves by\n1%, and the inference speed by 60% at 114.8 images detected per second (FPS).\nOur proposed RCS-YOLO achieves state-of-the-art performance on the brain tumor\ndetection task. The code is available at https://github.com/mkang315/RCS-YOLO.\n","authors":["Ming Kang","Chee-Ming Ting","Fung Fung Ting","Raphaël C. -W. Phan"],"pdf_url":"https://arxiv.org/pdf/2307.16412v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2108.11299v5","updated":"2023-10-03T13:08:50Z","published":"2021-08-25T15:49:10Z","title":"Certifiers Make Neural Networks Vulnerable to Availability Attacks","summary":" To achieve reliable, robust, and safe AI systems, it is vital to implement\nfallback strategies when AI predictions cannot be trusted. Certifiers for\nneural networks are a reliable way to check the robustness of these\npredictions. They guarantee for some predictions that a certain class of\nmanipulations or attacks could not have changed the outcome. 
For the remaining\npredictions without guarantees, the method abstains from making a prediction,\nand a fallback strategy needs to be invoked, which typically incurs additional\ncosts, can require a human operator, or even fail to provide any prediction.\nWhile this is a key concept towards safe and secure AI, we show for the first\ntime that this approach comes with its own security risks, as such fallback\nstrategies can be deliberately triggered by an adversary. In addition to\nnaturally occurring abstains for some inputs and perturbations, the adversary\ncan use training-time attacks to deliberately trigger the fallback with high\nprobability. This transfers the main system load onto the fallback, reducing\nthe overall system's integrity and/or availability. We design two novel\navailability attacks, which show the practical relevance of these threats. For\nexample, adding 1% poisoned data during training is sufficient to trigger the\nfallback and hence make the model unavailable for up to 100% of all inputs by\ninserting the trigger. Our extensive experiments across multiple datasets,\nmodel architectures, and certifiers demonstrate the broad applicability of\nthese attacks. An initial investigation into potential defenses shows that\ncurrent approaches are insufficient to mitigate the issue, highlighting the\nneed for new, specific solutions.\n","authors":["Tobias Lorenz","Marta Kwiatkowska","Mario Fritz"],"pdf_url":"https://arxiv.org/pdf/2108.11299v5.pdf","comment":"Published at 16th ACM Workshop on Artificial Intelligence and\n Security (AISec '23)"},{"id":"http://arxiv.org/abs/2305.18403v3","updated":"2023-10-03T12:51:55Z","published":"2023-05-28T15:15:48Z","title":"LoRAPrune: Pruning Meets Low-Rank Parameter-Efficient Fine-Tuning","summary":" Large pre-trained models (LPMs), such as LLaMA and GLM, have shown\nexceptional performance across various tasks through fine-tuning. Although\nlow-rank adaption (LoRA) has emerged to cheaply fine-tune these LPMs on\ndownstream tasks, their deployment is still hindered by the vast model scale\nand computational costs. Neural network pruning offers a way to compress LPMs.\nHowever, the current pruning methods designed for LPMs are not compatible with\nLoRA. This is due to their utilization of unstructured pruning on LPMs,\nimpeding the merging of LoRA weights, or their dependence on the gradients of\npre-trained weights to guide pruning, which can impose significant memory\noverhead. To this end, we propose LoRAPrune, a new framework that delivers an\naccurate, compact model for efficient inference in a highly memory-effective\nmanner. Specifically, we first design a LoRA-guided pruning criterion, which\nuses the weights and gradients of LoRA, rather than the gradients of\npre-trained weights for importance estimation. We then propose a structured\niterative pruning procedure, to remove redundant channels and heads. Extensive\nexperimental results demonstrate the superior performance of our LoRAPrune over\nexisting approaches on the LLaMA series models. For instance, at a 50\\%\ncompression rate, LoRAPrune outperforms LLM-Pruner by a perplexity reduction of\n8.0 on WikiText2 and 16.05 on PTB datasets, while concurrently reducing memory\nusage by 52.6\\%. 
The code will be released after review\n","authors":["Mingyang Zhang","Hao Chen","Chunhua Shen","Zhen Yang","Linlin Ou","Xinyi Yu","Bohan Zhuang"],"pdf_url":"https://arxiv.org/pdf/2305.18403v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02011v1","updated":"2023-10-03T12:34:31Z","published":"2023-10-03T12:34:31Z","title":"Decoding Human Activities: Analyzing Wearable Accelerometer and\n Gyroscope Data for Activity Recognition","summary":" A person's movement or relative positioning effectively generates raw\nelectrical signals that can be read by computing machines to apply various\nmanipulative techniques for the classification of different human activities.\nIn this paper, a stratified multi-structural approach based on a Residual\nnetwork ensembled with Residual MobileNet is proposed, termed as FusionActNet.\nThe proposed method involves using carefully designed Residual blocks for\nclassifying the static and dynamic activities separately because they have\nclear and distinct characteristics that set them apart. These networks are\ntrained independently, resulting in two specialized and highly accurate models.\nThese models excel at recognizing activities within a specific superclass by\ntaking advantage of the unique algorithmic benefits of architectural\nadjustments. Afterward, these two ResNets are passed through a weighted\nensemble-based Residual MobileNet. Subsequently, this ensemble proficiently\ndiscriminates between a specific static and a specific dynamic activity, which\nwere previously identified based on their distinct feature characteristics in\nthe earlier stage. The proposed model is evaluated using two publicly\naccessible datasets; namely, UCI HAR and Motion-Sense. Therein, it successfully\nhandled the highly confusing cases of data overlap. Therefore, the proposed\napproach achieves a state-of-the-art accuracy of 96.71% and 95.35% in the UCI\nHAR and Motion-Sense datasets respectively.\n","authors":["Utsab Saha","Sawradip Saha","Tahmid Kabir","Shaikh Anowarul Fattah","Mohammad Saquib"],"pdf_url":"https://arxiv.org/pdf/2310.02011v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02000v1","updated":"2023-10-03T12:19:19Z","published":"2023-10-03T12:19:19Z","title":"MUSCLE: Multi-task Self-supervised Continual Learning to Pre-train Deep\n Models for X-ray Images of Multiple Body Parts","summary":" While self-supervised learning (SSL) algorithms have been widely used to\npre-train deep models, few efforts [11] have been done to improve\nrepresentation learning of X-ray image analysis with SSL pre-trained models. In\nthis work, we study a novel self-supervised pre-training pipeline, namely\nMulti-task Self-super-vised Continual Learning (MUSCLE), for multiple medical\nimaging tasks, such as classification and segmentation, using X-ray images\ncollected from multiple body parts, including heads, lungs, and bones.\nSpecifically, MUSCLE aggregates X-rays collected from multiple body parts for\nMoCo-based representation learning, and adopts a well-designed continual\nlearning (CL) procedure to further pre-train the backbone subject various X-ray\nanalysis tasks jointly. 
Certain strategies for image pre-processing, learning\nschedules, and regularization have been used to solve data heterogeneity,\noverfitting, and catastrophic forgetting problems for multi-task/dataset\nlearning in MUSCLE.We evaluate MUSCLE using 9 real-world X-ray datasets with\nvarious tasks, including pneumonia classification, skeletal abnormality\nclassification, lung segmentation, and tuberculosis (TB) detection. Comparisons\nagainst other pre-trained models [7] confirm the proof-of-concept that\nself-supervised multi-task/dataset continual pre-training could boost the\nperformance of X-ray image analysis.\n","authors":["Weibin Liao","Haoyi Xiong","Qingzhong Wang","Yan Mo","Xuhong Li","Yi Liu","Zeyu Chen","Siyu Huang","Dejing Dou"],"pdf_url":"https://arxiv.org/pdf/2310.02000v1.pdf","comment":"accepted by Medical Image Computing and Computer Assisted\n Intervention (MICCAI) 2022"},{"id":"http://arxiv.org/abs/2310.01994v1","updated":"2023-10-03T12:08:15Z","published":"2023-10-03T12:08:15Z","title":"Understanding Masked Autoencoders From a Local Contrastive Perspective","summary":" Masked AutoEncoder(MAE) has revolutionized the field of self-supervised\nlearning with its simple yet effective masking and reconstruction strategies.\nHowever, despite achieving state-of-the-art performance across various\ndownstream vision tasks, the underlying mechanisms that drive MAE's efficacy\nare less well-explored compared to the canonical contrastive learning paradigm.\nIn this paper, we explore a new perspective to explain what truly contributes\nto the \"rich hidden representations inside the MAE\". Firstly, concerning MAE's\ngenerative pretraining pathway, with a unique encoder-decoder architecture to\nreconstruct images from aggressive masking, we conduct an in-depth analysis of\nthe decoder's behaviors. We empirically find that MAE's decoder mainly learns\nlocal features with a limited receptive field, adhering to the well-known\nLocality Principle. Building upon this locality assumption, we propose a\ntheoretical framework that reformulates the reconstruction-based MAE into a\nlocal region-level contrastive learning form for improved understanding.\nFurthermore, to substantiate the local contrastive nature of MAE, we introduce\na Siamese architecture that combines the essence of MAE and contrastive\nlearning without masking and explicit decoder, which sheds light on a unified\nand more flexible self-supervised learning framework.\n","authors":["Xiaoyu Yue","Lei Bai","Meng Wei","Jiangmiao Pang","Xihui Liu","Luping Zhou","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2310.01994v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01995v1","updated":"2023-10-03T12:08:15Z","published":"2023-10-03T12:08:15Z","title":"Development of Machine Vision Approach for Mechanical Component\n Identification based on its Dimension and Pitch","summary":" In this work, a highly customizable and scalable vision based system for\nautomation of mechanical assembly lines is described. The proposed system\ncalculates the features that are required to classify and identify the\ndifferent kinds of bolts that are used in the assembly line. The system\ndescribes a novel method of calculating the pitch of the bolt in addition to\nbolt identification and calculating the dimensions of the bolts. This\nidentification and classification system is extremely lightweight and can be\nrun on bare minimum hardware. 
The system is very fast in the order of\nmilliseconds, hence the system can be used successfully even if the components\nare steadily moving on a conveyor. The results show that our system can\ncorrectly identify the parts in our dataset with 98% accuracy using the\ncalculated features.\n","authors":["Toshit Jain","Faisel Mushtaq","K Ramesh","Sandip Deshmukh","Tathagata Ray","Chandu Parimi","Praveen Tandon","Pramod Kumar Jha"],"pdf_url":"https://arxiv.org/pdf/2310.01995v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2309.11500v3","updated":"2023-10-03T11:37:40Z","published":"2023-09-20T17:59:32Z","title":"A Large-scale Dataset for Audio-Language Representation Learning","summary":" The AI community has made significant strides in developing powerful\nfoundation models, driven by large-scale multimodal datasets. However, in the\naudio representation learning community, the present audio-language datasets\nsuffer from limitations such as insufficient volume, simplistic content, and\narduous collection procedures. To tackle these challenges, we present an\ninnovative and automatic audio caption generation pipeline based on a series of\npublic tools or APIs, and construct a large-scale, high-quality, audio-language\ndataset, named as Auto-ACD, comprising over 1.9M audio-text pairs. To\ndemonstrate the effectiveness of the proposed dataset, we train popular models\non our dataset and show performance improvement on various downstream tasks,\nnamely, audio-language retrieval, audio captioning, environment classification.\nIn addition, we establish a novel test set and provide a benchmark for\naudio-text tasks. The proposed dataset will be released at\nhttps://auto-acd.github.io/.\n","authors":["Luoyi Sun","Xuenan Xu","Mengyue Wu","Weidi Xie"],"pdf_url":"https://arxiv.org/pdf/2309.11500v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01957v1","updated":"2023-10-03T11:05:14Z","published":"2023-10-03T11:05:14Z","title":"Driving with LLMs: Fusing Object-Level Vector Modality for Explainable\n Autonomous Driving","summary":" Large Language Models (LLMs) have shown promise in the autonomous driving\nsector, particularly in generalization and interpretability. We introduce a\nunique object-level multimodal LLM architecture that merges vectorized numeric\nmodalities with a pre-trained LLM to improve context understanding in driving\nsituations. We also present a new dataset of 160k QA pairs derived from 10k\ndriving scenarios, paired with high quality control commands collected with RL\nagent and question answer pairs generated by teacher LLM (GPT-3.5). A distinct\npretraining strategy is devised to align numeric vector modalities with static\nLLM representations using vector captioning language data. We also introduce an\nevaluation metric for Driving QA and demonstrate our LLM-driver's proficiency\nin interpreting driving scenarios, answering questions, and decision-making.\nOur findings highlight the potential of LLM-based driving action generation in\ncomparison to traditional behavioral cloning. 
We make our benchmark, datasets,\nand model available for further exploration.\n","authors":["Long Chen","Oleg Sinavski","Jan Hünermann","Alice Karnsund","Andrew James Willmott","Danny Birch","Daniel Maund","Jamie Shotton"],"pdf_url":"https://arxiv.org/pdf/2310.01957v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.10616v4","updated":"2023-10-03T11:01:55Z","published":"2023-05-18T00:04:38Z","title":"Evaluation Metrics for DNNs Compression","summary":" There is a lot of ongoing research effort into developing different\ntechniques for neural networks compression. However, the community lacks\nstandardised evaluation metrics, which are key to identifying the most suitable\ncompression technique for different applications. This paper reviews existing\nneural network compression evaluation metrics and implements them into a\nstandardisation framework called NetZIP. We introduce two novel metrics to\ncover existing gaps of evaluation in the literature: 1) Compression and\nHardware Agnostic Theoretical Speed (CHATS) and 2) Overall Compression Success\n(OCS). We demonstrate the use of NetZIP using two case studies on two different\nhardware platforms (a PC and a Raspberry Pi 4) focusing on object\nclassification and object detection.\n","authors":["Abanoub Ghobrial","Samuel Budgett","Dieter Balemans","Hamid Asgari","Phil Reiter","Kerstin Eder"],"pdf_url":"https://arxiv.org/pdf/2305.10616v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01946v1","updated":"2023-10-03T10:45:37Z","published":"2023-10-03T10:45:37Z","title":"CoralVOS: Dataset and Benchmark for Coral Video Segmentation","summary":" Coral reefs formulate the most valuable and productive marine ecosystems,\nproviding habitat for many marine species. Coral reef surveying and analysis\nare currently confined to coral experts who invest substantial effort in\ngenerating comprehensive and dependable reports (\\emph{e.g.}, coral coverage,\npopulation, spatial distribution, \\textit{etc}), from the collected survey\ndata. However, performing dense coral analysis based on manual efforts is\nsignificantly time-consuming, the existing coral analysis algorithms compromise\nand opt for performing down-sampling and only conducting sparse point-based\ncoral analysis within selected frames. However, such down-sampling will\n\\textbf{inevitable} introduce the estimation bias or even lead to wrong\nresults. To address this issue, we propose to perform \\textbf{dense coral video\nsegmentation}, with no down-sampling involved. Through video object\nsegmentation, we could generate more \\textit{reliable} and \\textit{in-depth}\ncoral analysis than the existing coral reef analysis algorithms. To boost such\ndense coral analysis, we propose a large-scale coral video segmentation\ndataset: \\textbf{CoralVOS} as demonstrated in Fig. 1. To the best of our\nknowledge, our CoralVOS is the first dataset and benchmark supporting dense\ncoral video segmentation. We perform experiments on our CoralVOS dataset,\nincluding 6 recent state-of-the-art video object segmentation (VOS) algorithms.\nWe fine-tuned these VOS algorithms on our CoralVOS dataset and achieved\nobservable performance improvement. The results show that there is still great\npotential for further promoting the segmentation accuracy. 
The dataset and\ntrained models will be released with the acceptance of this work to foster the\ncoral reef research community.\n","authors":["Zheng Ziqiang","Xie Yaofeng","Liang Haixin","Yu Zhibin","Sai-Kit Yeung"],"pdf_url":"https://arxiv.org/pdf/2310.01946v1.pdf","comment":"8 pages, 9 figures, dense coral video segmentation dataset and\n benchmark"},{"id":"http://arxiv.org/abs/2310.01942v1","updated":"2023-10-03T10:38:39Z","published":"2023-10-03T10:38:39Z","title":"OOD Aware Supervised Contrastive Learning","summary":" Out-of-Distribution (OOD) detection is a crucial problem for the safe\ndeployment of machine learning models identifying samples that fall outside of\nthe training distribution, i.e. in-distribution data (ID). Most OOD works focus\non the classification models trained with Cross Entropy (CE) and attempt to fix\nits inherent issues. In this work we leverage powerful representation learned\nwith Supervised Contrastive (SupCon) training and propose a holistic approach\nto learn a classifier robust to OOD data. We extend SupCon loss with two\nadditional contrast terms. The first term pushes auxiliary OOD representations\naway from ID representations without imposing any constraints on similarities\namong auxiliary data. The second term pushes OOD features far from the existing\nclass prototypes, while pushing ID representations closer to their\ncorresponding class prototype. When auxiliary OOD data is not available, we\npropose feature mixing techniques to efficiently generate pseudo-OOD features.\nOur solution is simple and efficient and acts as a natural extension of the\nclosed-set supervised contrastive representation learning. We compare against\ndifferent OOD detection methods on the common benchmarks and show\nstate-of-the-art results.\n","authors":["Soroush Seifi","Daniel Olmeda Reino","Nikolay Chumerin","Rahaf Aljundi"],"pdf_url":"https://arxiv.org/pdf/2310.01942v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15514v2","updated":"2023-10-03T10:30:26Z","published":"2023-07-28T12:16:31Z","title":"Revisiting Fully Convolutional Geometric Features for Object 6D Pose\n Estimation","summary":" Recent works on 6D object pose estimation focus on learning keypoint\ncorrespondences between images and object models, and then determine the object\npose through RANSAC-based algorithms or by directly regressing the pose with\nend-to-end optimisations. We argue that learning point-level discriminative\nfeatures is overlooked in the literature. To this end, we revisit Fully\nConvolutional Geometric Features (FCGF) and tailor it for object 6D pose\nestimation to achieve state-of-the-art performance. FCGF employs sparse\nconvolutions and learns point-level features using a fully-convolutional\nnetwork by optimising a hardest contrastive loss. We can outperform recent\ncompetitors on popular benchmarks by adopting key modifications to the loss and\nto the input data representations, by carefully tuning the training strategies,\nand by employing data augmentations suitable for the underlying problem. We\ncarry out a thorough ablation to study the contribution of each modification.\nThe code is available at https://github.com/jcorsetti/FCGF6D.\n","authors":["Jaime Corsetti","Davide Boscaini","Fabio Poiesi"],"pdf_url":"https://arxiv.org/pdf/2307.15514v2.pdf","comment":"Camera ready version, 18 pages and 13 figures. 
Published at the 8th\n International Workshop on Recovering 6D Object Pose"},{"id":"http://arxiv.org/abs/2310.01936v1","updated":"2023-10-03T10:23:28Z","published":"2023-10-03T10:23:28Z","title":"Constructing Image-Text Pair Dataset from Books","summary":" Digital archiving is becoming widespread owing to its effectiveness in\nprotecting valuable books and providing knowledge to many people\nelectronically. In this paper, we propose a novel approach to leverage digital\narchives for machine learning. If we can fully utilize such digitized data,\nmachine learning has the potential to uncover unknown insights and ultimately\nacquire knowledge autonomously, just like humans read books. As a first step,\nwe design a dataset construction pipeline comprising an optical character\nreader (OCR), an object detector, and a layout analyzer for the autonomous\nextraction of image-text pairs. In our experiments, we apply our pipeline on\nold photo books to construct an image-text pair dataset, showing its\neffectiveness in image-text retrieval and insight extraction.\n","authors":["Yamato Okamoto","Haruto Toyonaga","Yoshihisa Ijiri","Hirokatsu Kataoka"],"pdf_url":"https://arxiv.org/pdf/2310.01936v1.pdf","comment":"Accepted at ICCV 2023 workshop, Towards the Next Generation of\n Computer Vision Datasets: General DataCentric Submission Track"},{"id":"http://arxiv.org/abs/2310.01934v1","updated":"2023-10-03T10:17:49Z","published":"2023-10-03T10:17:49Z","title":"Robust deformable image registration using cycle-consistent implicit\n representations","summary":" Recent works in medical image registration have proposed the use of Implicit\nNeural Representations, demonstrating performance that rivals state-of-the-art\nlearning-based methods. However, these implicit representations need to be\noptimized for each new image pair, which is a stochastic process that may fail\nto converge to a global minimum. To improve robustness, we propose a deformable\nregistration method using pairs of cycle-consistent Implicit Neural\nRepresentations: each implicit representation is linked to a second implicit\nrepresentation that estimates the opposite transformation, causing each network\nto act as a regularizer for its paired opposite. During inference, we generate\nmultiple deformation estimates by numerically inverting the paired backward\ntransformation and evaluating the consensus of the optimized pair. This\nconsensus improves registration accuracy over using a single representation and\nresults in a robust uncertainty metric that can be used for automatic quality\ncontrol. We evaluate our method with a 4D lung CT dataset. The proposed\ncycle-consistent optimization method reduces the optimization failure rate from\n2.4% to 0.0% compared to the current state-of-the-art. The proposed inference\nmethod improves landmark accuracy by 4.5% and the proposed uncertainty metric\ndetects all instances where the registration method fails to converge to a\ncorrect solution. We verify the generalizability of these results to other data\nusing a centerline propagation task in abdominal 4D MRI, where our method\nachieves a 46% improvement in propagation consistency compared with single-INR\nregistration and demonstrates a strong correlation between the proposed\nuncertainty metric and registration accuracy.\n","authors":["Louis D. 
van Harten","Jaap Stoker","Ivana Išgum"],"pdf_url":"https://arxiv.org/pdf/2310.01934v1.pdf","comment":"10 pages, 9 figures, accepted in IEEE Transactions on Medical Imaging"},{"id":"http://arxiv.org/abs/2310.01931v1","updated":"2023-10-03T10:13:42Z","published":"2023-10-03T10:13:42Z","title":"MarineDet: Towards Open-Marine Object Detection","summary":" Marine object detection has gained prominence in marine research, driven by\nthe pressing need to unravel oceanic mysteries and enhance our understanding of\ninvaluable marine ecosystems. There is a profound requirement to efficiently\nand accurately identify and localize diverse and unseen marine entities within\nunderwater imagery. The open-marine object detection (OMOD for short) is\nrequired to detect diverse and unseen marine objects, performing categorization\nand localization simultaneously. To achieve OMOD, we present\n\\textbf{MarineDet}. We formulate a joint visual-text semantic space through\npre-training and then perform marine-specific training to achieve\nin-air-to-marine knowledge transfer. Considering there is no specific dataset\ndesigned for OMOD, we construct a \\textbf{MarineDet dataset} consisting of 821\nmarine-relative object categories to promote and measure OMOD performance. The\nexperimental results demonstrate the superior performance of MarineDet over\nexisting generalist and specialist object detection algorithms. To the best of\nour knowledge, we are the first to present OMOD, which holds a more valuable\nand practical setting for marine ecosystem monitoring and management. Our\nresearch not only pushes the boundaries of marine understanding but also offers\na standard pipeline for OMOD.\n","authors":["Liang Haixin","Zheng Ziqiang","Ma Zeyu","Sai-Kit Yeung"],"pdf_url":"https://arxiv.org/pdf/2310.01931v1.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2310.01926v1","updated":"2023-10-03T10:10:42Z","published":"2023-10-03T10:10:42Z","title":"DARTH: Holistic Test-time Adaptation for Multiple Object Tracking","summary":" Multiple object tracking (MOT) is a fundamental component of perception\nsystems for autonomous driving, and its robustness to unseen conditions is a\nrequirement to avoid life-critical failures. Despite the urge of safety in\ndriving systems, no solution to the MOT adaptation problem to domain shift in\ntest-time conditions has ever been proposed. However, the nature of a MOT\nsystem is manifold - requiring object detection and instance association - and\nadapting all its components is non-trivial. In this paper, we analyze the\neffect of domain shift on appearance-based trackers, and introduce DARTH, a\nholistic test-time adaptation framework for MOT. We propose a detection\nconsistency formulation to adapt object detection in a self-supervised fashion,\nwhile adapting the instance appearance representations via our novel patch\ncontrastive loss. We evaluate our method on a variety of domain shifts -\nincluding sim-to-real, outdoor-to-indoor, indoor-to-outdoor - and substantially\nimprove the source model performance on all metrics. 
Code:\nhttps://github.com/mattiasegu/darth.\n","authors":["Mattia Segu","Bernt Schiele","Fisher Yu"],"pdf_url":"https://arxiv.org/pdf/2310.01926v1.pdf","comment":"Proceedings of the IEEE/CVF International Conference on Computer\n Vision"},{"id":"http://arxiv.org/abs/2307.00724v4","updated":"2023-10-03T10:07:26Z","published":"2023-07-03T03:09:44Z","title":"LXL: LiDAR Excluded Lean 3D Object Detection with 4D Imaging Radar and\n Camera Fusion","summary":" As an emerging technology and a relatively affordable device, the 4D imaging\nradar has already been confirmed effective in performing 3D object detection in\nautonomous driving. Nevertheless, the sparsity and noisiness of 4D radar point\nclouds hinder further performance improvement, and in-depth studies about its\nfusion with other modalities are lacking. On the other hand, as a new image\nview transformation strategy, \"sampling\" has been applied in a few image-based\ndetectors and shown to outperform the widely applied \"depth-based splatting\"\nproposed in Lift-Splat-Shoot (LSS), even without image depth prediction.\nHowever, the potential of \"sampling\" is not fully unleashed. This paper\ninvestigates the \"sampling\" view transformation strategy on the camera and 4D\nimaging radar fusion-based 3D object detection. LiDAR Excluded Lean (LXL)\nmodel, predicted image depth distribution maps and radar 3D occupancy grids are\ngenerated from image perspective view (PV) features and radar bird's eye view\n(BEV) features, respectively. They are sent to the core of LXL, called \"radar\noccupancy-assisted depth-based sampling\", to aid image view transformation. We\ndemonstrated that more accurate view transformation can be performed by\nintroducing image depths and radar information to enhance the \"sampling\"\nstrategy. Experiments on VoD and TJ4DRadSet datasets show that the proposed\nmethod outperforms the state-of-the-art 3D object detection methods by a\nsignificant margin without bells and whistles. Ablation studies demonstrate\nthat our method performs the best among different enhancement settings.\n","authors":["Weiyi Xiong","Jianan Liu","Tao Huang","Qing-Long Han","Yuxuan Xia","Bing Zhu"],"pdf_url":"https://arxiv.org/pdf/2307.00724v4.pdf","comment":"Accepted by IEEE Transactions on Intelligent Vehicles"},{"id":"http://arxiv.org/abs/2310.01924v1","updated":"2023-10-03T09:59:59Z","published":"2023-10-03T09:59:59Z","title":"RoFormer for Position Aware Multiple Instance Learning in Whole Slide\n Image Classification","summary":" Whole slide image (WSI) classification is a critical task in computational\npathology. However, the gigapixel-size of such images remains a major challenge\nfor the current state of deep-learning. Current methods rely on\nmultiple-instance learning (MIL) models with frozen feature extractors. Given\nthe the high number of instances in each image, MIL methods have long assumed\nindependence and permutation-invariance of patches, disregarding the tissue\nstructure and correlation between patches. Recent works started studying this\ncorrelation between instances but the computational workload of such a high\nnumber of tokens remained a limiting factor. In particular, relative position\nof patches remains unaddressed. We propose to apply a straightforward encoding\nmodule, namely a RoFormer layer , relying on memory-efficient exact\nself-attention and relative positional encoding. 
This module can perform full\nself-attention with relative position encoding on patches of large and\narbitrary shaped WSIs, solving the need for correlation between instances and\nspatial modeling of tissues. We demonstrate that our method outperforms\nstate-of-the-art MIL models on three commonly used public datasets (TCGA-NSCLC,\nBRACS and Camelyon16)) on weakly supervised classification tasks. Code is\navailable at https://github.com/Sanofi-Public/DDS-RoFormerMIL\n","authors":["Etienne Pochet","Rami Maroun","Roger Trullo"],"pdf_url":"https://arxiv.org/pdf/2310.01924v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03335v2","updated":"2023-10-03T09:40:35Z","published":"2023-09-06T19:30:22Z","title":"SADIR: Shape-Aware Diffusion Models for 3D Image Reconstruction","summary":" 3D image reconstruction from a limited number of 2D images has been a\nlong-standing challenge in computer vision and image analysis. While deep\nlearning-based approaches have achieved impressive performance in this area,\nexisting deep networks often fail to effectively utilize the shape structures\nof objects presented in images. As a result, the topology of reconstructed\nobjects may not be well preserved, leading to the presence of artifacts such as\ndiscontinuities, holes, or mismatched connections between different parts. In\nthis paper, we propose a shape-aware network based on diffusion models for 3D\nimage reconstruction, named SADIR, to address these issues. In contrast to\nprevious methods that primarily rely on spatial correlations of image\nintensities for 3D reconstruction, our model leverages shape priors learned\nfrom the training data to guide the reconstruction process. To achieve this, we\ndevelop a joint learning network that simultaneously learns a mean shape under\ndeformation models. Each reconstructed image is then considered as a deformed\nvariant of the mean shape. We validate our model, SADIR, on both brain and\ncardiac magnetic resonance images (MRIs). Experimental results show that our\nmethod outperforms the baselines with lower reconstruction error and better\npreservation of the shape structure of objects within the images.\n","authors":["Nivetha Jayakumar","Tonmoy Hossain","Miaomiao Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.03335v2.pdf","comment":"ShapeMI MICCAI 2023: Workshop on Shape in Medical Imaging"},{"id":"http://arxiv.org/abs/2310.01912v1","updated":"2023-10-03T09:35:38Z","published":"2023-10-03T09:35:38Z","title":"Improved Automatic Diabetic Retinopathy Severity Classification Using\n Deep Multimodal Fusion of UWF-CFP and OCTA Images","summary":" Diabetic Retinopathy (DR), a prevalent and severe complication of diabetes,\naffects millions of individuals globally, underscoring the need for accurate\nand timely diagnosis. Recent advancements in imaging technologies, such as\nUltra-WideField Color Fundus Photography (UWF-CFP) imaging and Optical\nCoherence Tomography Angiography (OCTA), provide opportunities for the early\ndetection of DR but also pose significant challenges given the disparate nature\nof the data they produce. This study introduces a novel multimodal approach\nthat leverages these imaging modalities to notably enhance DR classification.\nOur approach integrates 2D UWF-CFP images and 3D high-resolution 6x6 mm$^3$\nOCTA (both structure and flow) images using a fusion of ResNet50 and\n3D-ResNet50 models, with Squeeze-and-Excitation (SE) blocks to amplify relevant\nfeatures. 
Additionally, to increase the model's generalization capabilities, a\nmultimodal extension of Manifold Mixup, applied to concatenated multimodal\nfeatures, is implemented. Experimental results demonstrate a remarkable\nenhancement in DR classification performance with the proposed multimodal\napproach compared to methods relying on a single modality only. The methodology\nlaid out in this work holds substantial promise for facilitating more accurate,\nearly detection of DR, potentially improving clinical outcomes for patients.\n","authors":["Mostafa El Habib Daho","Yihao Li","Rachid Zeghlache","Yapo Cedric Atse","Hugo Le Boité","Sophie Bonnin","Deborah Cosette","Pierre Deman","Laurent Borderie","Capucine Lepicard","Ramin Tadayoni","Béatrice Cochener","Pierre-Henri Conze","Mathieu Lamard","Gwenolé Quellec"],"pdf_url":"https://arxiv.org/pdf/2310.01912v1.pdf","comment":"Accepted preprint for presentation at MICCAI-OMIA 2023, Vancouver,\n Canada"},{"id":"http://arxiv.org/abs/2309.14622v2","updated":"2023-10-03T09:32:46Z","published":"2023-09-26T02:21:23Z","title":"Divide and Conquer in Video Anomaly Detection: A Comprehensive Review\n and New Approach","summary":" Video anomaly detection is a complex task, and the principle of \"divide and\nconquer\" is often regarded as an effective approach to tackling intricate\nissues. It's noteworthy that recent methods in video anomaly detection have\nrevealed the application of the divide and conquer philosophy (albeit with\ndistinct perspectives from traditional usage), yielding impressive outcomes.\nThis paper systematically reviews this literature from six dimensions, aiming\nto enhance the use of the divide and conquer strategy in video anomaly\ndetection. Furthermore, based on the insights gained from this review, a novel\napproach is presented, which integrates human skeletal frameworks with video\ndata analysis techniques. This method achieves state-of-the-art performance on\nthe ShanghaiTech dataset, surpassing all existing advanced methods.\n","authors":["Jian Xiao","Tianyuan Liu","Genlin Ji"],"pdf_url":"https://arxiv.org/pdf/2309.14622v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01908v1","updated":"2023-10-03T09:30:52Z","published":"2023-10-03T09:30:52Z","title":"Improving style transfer in dynamic contrast enhanced MRI using a\n spatio-temporal approach","summary":" Style transfer in DCE-MRI is a challenging task due to large variations in\ncontrast enhancements across different tissues and time. Current unsupervised\nmethods fail due to the wide variety of contrast enhancement and motion between\nthe images in the series. We propose a new method that combines autoencoders to\ndisentangle content and style with convolutional LSTMs to model predicted\nlatent spaces along time and adaptive convolutions to tackle the localised\nnature of contrast enhancement. To evaluate our method, we propose a new metric\nthat takes into account the contrast enhancement. Qualitative and quantitative\nanalyses show that the proposed method outperforms the state of the art on two\ndifferent datasets.\n","authors":["Adam G. Tattersall","Keith A. Goatman","Lucy E. Kershaw","Scott I. K. 
Semple","Sonia Dahdouh"],"pdf_url":"https://arxiv.org/pdf/2310.01908v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01904v1","updated":"2023-10-03T09:22:06Z","published":"2023-10-03T09:22:06Z","title":"Beyond the Benchmark: Detecting Diverse Anomalies in Videos","summary":" Video Anomaly Detection (VAD) plays a crucial role in modern surveillance\nsystems, aiming to identify various anomalies in real-world situations.\nHowever, current benchmark datasets predominantly emphasize simple,\nsingle-frame anomalies such as novel object detection. This narrow focus\nrestricts the advancement of VAD models. In this research, we advocate for an\nexpansion of VAD investigations to encompass intricate anomalies that extend\nbeyond conventional benchmark boundaries. To facilitate this, we introduce two\ndatasets, HMDB-AD and HMDB-Violence, to challenge models with diverse\naction-based anomalies. These datasets are derived from the HMDB51 action\nrecognition dataset. We further present Multi-Frame Anomaly Detection (MFAD), a\nnovel method built upon the AI-VAD framework. AI-VAD utilizes single-frame\nfeatures such as pose estimation and deep image encoding, and two-frame\nfeatures such as object velocity. They then apply a density estimation\nalgorithm to compute anomaly scores. To address complex multi-frame anomalies,\nwe add a deep video encoding features capturing long-range temporal\ndependencies, and logistic regression to enhance final score calculation.\nExperimental results confirm our assumptions, highlighting existing models\nlimitations with new anomaly types. MFAD excels in both simple and complex\nanomaly detection scenarios.\n","authors":["Yoav Arad","Michael Werman"],"pdf_url":"https://arxiv.org/pdf/2310.01904v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01897v1","updated":"2023-10-03T09:12:07Z","published":"2023-10-03T09:12:07Z","title":"MFOS: Model-Free & One-Shot Object Pose Estimation","summary":" Existing learning-based methods for object pose estimation in RGB images are\nmostly model-specific or category based. They lack the capability to generalize\nto new object categories at test time, hence severely hindering their\npracticability and scalability. Notably, recent attempts have been made to\nsolve this issue, but they still require accurate 3D data of the object surface\nat both train and test time. In this paper, we introduce a novel approach that\ncan estimate in a single forward pass the pose of objects never seen during\ntraining, given minimum input. In contrast to existing state-of-the-art\napproaches, which rely on task-specific modules, our proposed model is entirely\nbased on a transformer architecture, which can benefit from recently proposed\n3D-geometry general pretraining. We conduct extensive experiments and report\nstate-of-the-art one-shot performance on the challenging LINEMOD benchmark.\nFinally, extensive ablations allow us to determine good practices with this\nrelatively new type of architecture in the field.\n","authors":["JongMin Lee","Yohann Cabon","Romain Brégier","Sungjoo Yoo","Jerome Revaud"],"pdf_url":"https://arxiv.org/pdf/2310.01897v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.15289v2","updated":"2023-10-03T08:52:42Z","published":"2023-09-26T21:56:03Z","title":"SEPT: Towards Efficient Scene Representation Learning for Motion\n Prediction","summary":" Motion prediction is crucial for autonomous vehicles to operate safely in\ncomplex traffic environments. 
Extracting effective spatiotemporal relationships\namong traffic elements is key to accurate forecasting. Inspired by the\nsuccessful practice of pretrained large language models, this paper presents\nSEPT, a modeling framework that leverages self-supervised learning to develop\npowerful spatiotemporal understanding for complex traffic scenes. Specifically,\nour approach involves three masking-reconstruction modeling tasks on scene\ninputs including agents' trajectories and road network, pretraining the scene\nencoder to capture kinematics within trajectory, spatial structure of road\nnetwork, and interactions among roads and agents. The pretrained encoder is\nthen finetuned on the downstream forecasting task. Extensive experiments\ndemonstrate that SEPT, without elaborate architectural design or manual feature\nengineering, achieves state-of-the-art performance on the Argoverse 1 and\nArgoverse 2 motion forecasting benchmarks, outperforming previous methods on\nall main metrics by a large margin.\n","authors":["Zhiqian Lan","Yuxuan Jiang","Yao Mu","Chen Chen","Shengbo Eben Li","Hang Zhao","Keqiang Li"],"pdf_url":"https://arxiv.org/pdf/2309.15289v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16499v2","updated":"2023-10-03T08:49:58Z","published":"2023-09-26T23:55:39Z","title":"Cross-City Matters: A Multimodal Remote Sensing Benchmark Dataset for\n Cross-City Semantic Segmentation using High-Resolution Domain Adaptation\n Networks","summary":" Artificial intelligence (AI) approaches nowadays have gained remarkable\nsuccess in single-modality-dominated remote sensing (RS) applications,\nespecially with an emphasis on individual urban environments (e.g., single\ncities or regions). Yet these AI models tend to meet the performance bottleneck\nin the case studies across cities or regions, due to the lack of diverse RS\ninformation and cutting-edge solutions with high generalization ability. To\nthis end, we build a new set of multimodal remote sensing benchmark datasets\n(including hyperspectral, multispectral, SAR) for the study purpose of the\ncross-city semantic segmentation task (called C2Seg dataset), which consists of\ntwo cross-city scenes, i.e., Berlin-Augsburg (in Germany) and Beijing-Wuhan (in\nChina). Beyond the single city, we propose a high-resolution domain adaptation\nnetwork, HighDAN for short, to promote the AI model's generalization ability\nfrom the multi-city environments. HighDAN is capable of retaining the spatially\ntopological structure of the studied urban scene well in a parallel high-to-low\nresolution fusion fashion but also closing the gap derived from enormous\ndifferences of RS image representations between different cities by means of\nadversarial learning. In addition, the Dice loss is considered in HighDAN to\nalleviate the class imbalance issue caused by factors across cities. Extensive\nexperiments conducted on the C2Seg dataset show the superiority of our HighDAN\nin terms of segmentation performance and generalization ability, compared to\nstate-of-the-art competitors. 
The C2Seg dataset and the semantic segmentation\ntoolbox (involving the proposed HighDAN) will be available publicly at\nhttps://github.com/danfenghong.\n","authors":["Danfeng Hong","Bing Zhang","Hao Li","Yuxuan Li","Jing Yao","Chenyu Li","Martin Werner","Jocelyn Chanussot","Alexander Zipf","Xiao Xiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2309.16499v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01886v1","updated":"2023-10-03T08:39:33Z","published":"2023-10-03T08:39:33Z","title":"Effective and Parameter-Efficient Reusing Fine-Tuned Models","summary":" Many pre-trained large-scale models provided online have become highly\neffective in transferring to downstream tasks. At the same time, various\ntask-specific models fine-tuned on these pre-trained models are available\nonline for public use. In practice, as collecting task-specific data is\nlabor-intensive and fine-tuning the large pre-trained models is computationally\nexpensive, one can reuse task-specific finetuned models to deal with downstream\ntasks. However, using a model per task causes a heavy burden on storage and\nserving. Recently, many training-free and parameter-efficient methods have been\nproposed for reusing multiple fine-tuned task-specific models into a single\nmulti-task model. However, these methods exhibit a large accuracy gap compared\nwith using a fine-tuned model per task. In this paper, we propose\nParameter-Efficient methods for ReUsing (PERU) fine-tuned models. For reusing\nFully Fine-Tuned (FFT) models, we propose PERU-FFT by injecting a sparse task\nvector into a merged model by magnitude pruning. For reusing LoRA fine-tuned\nmodels, we propose PERU-LoRA, which uses a lower-rank matrix to approximate the LoRA\nmatrix by singular value decomposition. Both PERU-FFT and PERU-LoRA are\ntraining-free. Extensive experiments conducted on computer vision and natural\nlanguage processing tasks demonstrate the effectiveness and parameter-efficiency\nof the proposed methods. The proposed PERU-FFT and PERU-LoRA outperform\nexisting reusing model methods by a large margin and achieve comparable\nperformance to using a fine-tuned model per task.\n","authors":["Weisen Jiang","Baijiong Lin","Han Shi","Yu Zhang","Zhenguo Li","James T. Kwok"],"pdf_url":"https://arxiv.org/pdf/2310.01886v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2310.01405v2","updated":"2023-10-03T08:39:09Z","published":"2023-10-02T17:59:07Z","title":"Representation Engineering: A Top-Down Approach to AI Transparency","summary":" In this paper, we identify and characterize the emerging area of\nrepresentation engineering (RepE), an approach to enhancing the transparency of\nAI systems that draws on insights from cognitive neuroscience. RepE places\npopulation-level representations, rather than neurons or circuits, at the\ncenter of analysis, equipping us with novel methods for monitoring and\nmanipulating high-level cognitive phenomena in deep neural networks (DNNs). We\nprovide baselines and an initial analysis of RepE techniques, showing that they\noffer simple yet effective solutions for improving our understanding and\ncontrol of large language models. We showcase how these methods can provide\ntraction on a wide range of safety-relevant problems, including honesty,\nharmlessness, power-seeking, and more, demonstrating the promise of top-down\ntransparency research. 
We hope that this work catalyzes further exploration of\nRepE and fosters advancements in the transparency and safety of AI systems.\n","authors":["Andy Zou","Long Phan","Sarah Chen","James Campbell","Phillip Guo","Richard Ren","Alexander Pan","Xuwang Yin","Mantas Mazeika","Ann-Kathrin Dombrowski","Shashwat Goel","Nathaniel Li","Michael J. Byun","Zifan Wang","Alex Mallen","Steven Basart","Sanmi Koyejo","Dawn Song","Matt Fredrikson","J. Zico Kolter","Dan Hendrycks"],"pdf_url":"https://arxiv.org/pdf/2310.01405v2.pdf","comment":"Code is available at\n https://github.com/andyzoujm/representation-engineering"},{"id":"http://arxiv.org/abs/2310.01881v1","updated":"2023-10-03T08:34:49Z","published":"2023-10-03T08:34:49Z","title":"Adaptive Multi-NeRF: Exploit Efficient Parallelism in Adaptive Multiple\n Scale Neural Radiance Field Rendering","summary":" Recent advances in Neural Radiance Fields (NeRF) have demonstrated\nsignificant potential for representing 3D scene appearances as implicit neural\nnetworks, enabling the synthesis of high-fidelity novel views. However, the\nlengthy training and rendering process hinders the widespread adoption of this\npromising technique for real-time rendering applications. To address this\nissue, we present an effective adaptive multi-NeRF method designed to\naccelerate the neural rendering process for large scenes with unbalanced\nworkloads due to varying scene complexities.\n Our method adaptively subdivides scenes into axis-aligned bounding boxes\nusing a tree hierarchy approach, assigning smaller NeRFs to different-sized\nsubspaces based on the complexity of each scene portion. This ensures the\nunderlying neural representation is specific to a particular part of the scene.\nWe optimize scene subdivision by employing a guidance density grid, which\nbalances representation capability for each Multilayer Perceptron (MLP).\nConsequently, samples generated by each ray can be sorted and collected for\nparallel inference, achieving a balanced workload suitable for small MLPs with\nconsistent dimensions for regular and GPU-friendly computations. We also\ndemonstrate an efficient NeRF sampling strategy that intrinsically adapts to\nincrease parallelism and utilization and to reduce kernel calls, thereby achieving\nmuch higher GPU utilization and accelerating the rendering process.\n","authors":["Tong Wang","Shuichi Kurabayashi"],"pdf_url":"https://arxiv.org/pdf/2310.01881v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01876v1","updated":"2023-10-03T08:26:27Z","published":"2023-10-03T08:26:27Z","title":"A Dual Attentive Generative Adversarial Network for Remote Sensing Image\n Change Detection","summary":" Remote sensing change detection between bi-temporal images receives growing\nattention from researchers. However, comparing two bi-temporal images for\ndetecting changes is challenging, as they demonstrate different appearances. In\nthis paper, we propose a dual attentive generative adversarial network for\nachieving very high-resolution remote sensing image change detection tasks,\nwhich regards the detection model as a generator and attains the optimal\nweights of the detection model without increasing the parameters of the\ndetection model through generative-adversarial strategy, boosting the spatial\ncontiguity of predictions. 
Moreover, We design a multi-level feature extractor\nfor effectively fusing multi-level features, which adopts the pre-trained model\nto extract multi-level features from bi-temporal images and introduces\naggregate connections to fuse them. To strengthen the identification of\nmulti-scale objects, we propose a multi-scale adaptive fusion module to\nadaptively fuse multi-scale features through various receptive fields and\ndesign a context refinement module to explore contextual dependencies.\nMoreover, the DAGAN framework utilizes the 4-layer convolution network as a\ndiscriminator to identify whether the synthetic image is fake or real.\nExtensive experiments represent that the DAGAN framework has better performance\nwith 85.01% mean IoU and 91.48% mean F1 score than advanced methods on the\nLEVIR dataset.\n","authors":["Luyi Qiu","Xiaofeng Zhang","ChaoChen Gu","and ShanYing Zhu"],"pdf_url":"https://arxiv.org/pdf/2310.01876v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00481v4","updated":"2023-10-03T08:02:02Z","published":"2023-07-02T05:48:19Z","title":"Seeing is not Believing: An Identity Hider for Human Vision Privacy\n Protection","summary":" Massive captured face images are stored in the database for the\nidentification of individuals. However, these images can be observed\nintentionally or unintentionally by data managers, which is not at the will of\nindividuals and may cause privacy violations. Existing protection schemes can\nmaintain identifiability but slightly change the facial appearance, rendering\nit still susceptible to the visual perception of the original identity by data\nmanagers. In this paper, we propose an effective identity hider for human\nvision protection, which can significantly change appearance to visually hide\nidentity while allowing identification for face recognizers. Concretely, the\nidentity hider benefits from two specially designed modules: 1) The virtual\nface generation module generates a virtual face with a new appearance by\nmanipulating the latent space of StyleGAN2. In particular, the virtual face has\na similar parsing map to the original face, supporting other vision tasks such\nas head pose detection. 2) The appearance transfer module transfers the\nappearance of the virtual face into the original face via attribute\nreplacement. Meanwhile, identity information can be preserved well with the\nhelp of the disentanglement networks. In addition, diversity and background\npreservation are supported to meet the various requirements. Extensive\nexperiments demonstrate that the proposed identity hider achieves excellent\nperformance on privacy protection and identifiability preservation.\n","authors":["Tao Wang","Yushu Zhang","Zixuan Yang","Hua Zhang","Zhongyun Hua"],"pdf_url":"https://arxiv.org/pdf/2307.00481v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01861v1","updated":"2023-10-03T07:50:32Z","published":"2023-10-03T07:50:32Z","title":"Shifting More Attention to Breast Lesion Segmentation in Ultrasound\n Videos","summary":" Breast lesion segmentation in ultrasound (US) videos is essential for\ndiagnosing and treating axillary lymph node metastasis. However, the lack of a\nwell-established and large-scale ultrasound video dataset with high-quality\nannotations has posed a persistent challenge for the research community. To\novercome this issue, we meticulously curated a US video breast lesion\nsegmentation dataset comprising 572 videos and 34,300 annotated frames,\ncovering a wide range of realistic clinical scenarios. 
Furthermore, we propose\na novel frequency and localization feature aggregation network (FLA-Net) that\nlearns temporal features from the frequency domain and predicts additional\nlesion location positions to assist with breast lesion segmentation. We also\ndevise a localization-based contrastive loss to reduce the lesion location\ndistance between neighboring video frames within the same video and enlarge the\nlocation distances between frames from different ultrasound videos. Our\nexperiments on our annotated dataset and two public video polyp segmentation\ndatasets demonstrate that our proposed FLA-Net achieves state-of-the-art\nperformance in breast lesion segmentation in US videos and video polyp\nsegmentation while significantly reducing time and space complexity. Our model\nand dataset are available at https://github.com/jhl-Det/FLA-Net.\n","authors":["Junhao Lin","Qian Dai","Lei Zhu","Huazhu Fu","Qiong Wang","Weibin Li","Wenhao Rao","Xiaoyang Huang","Liansheng Wang"],"pdf_url":"https://arxiv.org/pdf/2310.01861v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2306.05272v3","updated":"2023-10-03T07:37:54Z","published":"2023-06-08T15:20:27Z","title":"Image Clustering via the Principle of Rate Reduction in the Age of\n Pretrained Models","summary":" The advent of large pre-trained models has brought about a paradigm shift in\nboth visual representation learning and natural language processing. However,\nclustering unlabeled images, as a fundamental and classic machine learning\nproblem, still lacks an effective solution, particularly for large-scale\ndatasets. In this paper, we propose a novel image clustering pipeline that\nleverages the powerful feature representation of large pre-trained models such\nas CLIP and cluster images effectively and efficiently at scale. We first\ndeveloped a novel algorithm to estimate the number of clusters in a given\ndataset. We then show that the pre-trained features are significantly more\nstructured by further optimizing the rate reduction objective. The resulting\nfeatures may significantly improve the clustering accuracy, e.g., from 57\\% to\n66\\% on ImageNet-1k. Furthermore, by leveraging CLIP's multimodality bridge\nbetween image and text, we develop a simple yet effective self-labeling\nalgorithm that produces meaningful text labels for the clusters. Through\nextensive experiments, we show that our pipeline works well on standard\ndatasets such as CIFAR-10, CIFAR-100, and ImageNet-1k. It also extends to\ndatasets without predefined labels, such as LAION-Aesthetics and WikiArts. We\nreleased the code in https://github.com/LeslieTrue/CPP.\n","authors":["Tianzhe Chu","Shengbang Tong","Tianjiao Ding","Xili Dai","Benjamin David Haeffele","René Vidal","Yi Ma"],"pdf_url":"https://arxiv.org/pdf/2306.05272v3.pdf","comment":"23 pages, 14 figures"},{"id":"http://arxiv.org/abs/2310.01852v1","updated":"2023-10-03T07:33:27Z","published":"2023-10-03T07:33:27Z","title":"LanguageBind: Extending Video-Language Pretraining to N-modality by\n Language-based Semantic Alignment","summary":" The video-language (VL) pretraining has achieved remarkable improvement in\nmultiple downstream tasks. However, the current VL pretraining framework is\nhard to extend to multiple modalities (N modalities, N>=3) beyond vision and\nlanguage. We thus propose LanguageBind, taking the language as the bind across\ndifferent modalities because the language modality is well-explored and\ncontains rich semantics. 
Specifically, we freeze the language encoder acquired\nby VL pretraining, then train encoders for other modalities with contrastive\nlearning. As a result, all modalities are mapped to a shared feature space,\nimplementing multi-modal semantic alignment. While LanguageBind ensures that we\ncan extend VL modalities to N modalities, we also need a high-quality dataset\nwith alignment data pairs centered on language. We thus propose VIDAL-10M with\nVideo, Infrared, Depth, Audio and their corresponding Language, naming as\nVIDAL-10M. In our VIDAL-10M, all videos are from short video platforms with\ncomplete semantics rather than truncated segments from long videos, and all the\nvideo, depth, infrared, and audio modalities are aligned to their textual\ndescriptions. After pretraining on VIDAL-10M, we outperform ImageBind by 1.2%\nR@1 on the MSR-VTT dataset with only 15% of the parameters in the zero-shot\nvideo-text retrieval, validating the high quality of our dataset. Beyond this,\nour LanguageBind has achieved great improvement in the zero-shot video, audio,\ndepth, and infrared understanding tasks. For instance, on the LLVIP and NYU-D\ndatasets, LanguageBind outperforms ImageBind-huge with 23.8% and 11.1% top-1\naccuracy.\n","authors":["Bin Zhu","Bin Lin","Munan Ning","Yang Yan","Jiaxi Cui","Wang HongFa","Yatian Pang","Wenhao Jiang","Junwu Zhang","Zongwei Li","Cai Wan Zhang","Zhifeng Li","Wei Liu","Li Yuan"],"pdf_url":"https://arxiv.org/pdf/2310.01852v1.pdf","comment":"Under review as a conference paper at ICLR 2024"},{"id":"http://arxiv.org/abs/2310.01845v1","updated":"2023-10-03T07:19:59Z","published":"2023-10-03T07:19:59Z","title":"Zero-Shot Refinement of Buildings' Segmentation Models using SAM","summary":" Foundation models have excelled in various tasks but are often evaluated on\ngeneral benchmarks. The adaptation of these models for specific domains, such\nas remote sensing imagery, remains an underexplored area. In remote sensing,\nprecise building instance segmentation is vital for applications like urban\nplanning. While Convolutional Neural Networks (CNNs) perform well, their\ngeneralization can be limited. For this aim, we present a novel approach to\nadapt foundation models to address existing models' generalization dropback.\nAmong several models, our focus centers on the Segment Anything Model (SAM), a\npotent foundation model renowned for its prowess in class-agnostic image\nsegmentation capabilities. We start by identifying the limitations of SAM,\nrevealing its suboptimal performance when applied to remote sensing imagery.\nMoreover, SAM does not offer recognition abilities and thus fails to classify\nand tag localized objects. To address these limitations, we introduce different\nprompting strategies, including integrating a pre-trained CNN as a prompt\ngenerator. This novel approach augments SAM with recognition abilities, a first\nof its kind. We evaluated our method on three remote sensing datasets,\nincluding the WHU Buildings dataset, the Massachusetts Buildings dataset, and\nthe AICrowd Mapping Challenge. For out-of-distribution performance on the WHU\ndataset, we achieve a 5.47% increase in IoU and a 4.81% improvement in\nF1-score. For in-distribution performance on the WHU dataset, we observe a\n2.72% and 1.58% increase in True-Positive-IoU and True-Positive-F1 score,\nrespectively. 
We intend to release our code repository, hoping to inspire\nfurther exploration of foundation models for domain-specific tasks within the\nremote sensing community.\n","authors":["Ali Mayladan","Hasan Nasrallah","Hasan Moughnieh","Mustafa Shukor","Ali J. Ghandour"],"pdf_url":"https://arxiv.org/pdf/2310.01845v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01843v1","updated":"2023-10-03T07:17:58Z","published":"2023-10-03T07:17:58Z","title":"Selective Feature Adapter for Dense Vision Transformers","summary":" Fine-tuning pre-trained transformer models, e.g., Swin Transformer, are\nsuccessful in numerous downstream for dense prediction vision tasks. However,\none major issue is the cost/storage of their huge amount of parameters, which\nbecomes increasingly challenging to handle with the growing amount of vision\ntasks. In this paper, we propose an effective approach to alleviate the issue,\nnamely selective feature adapter (SFA). It achieves state-of-the-art (SoTA)\nperformance under any given budget of trainable parameters, and demonstrates\ncomparable or better performance than fully fine-tuned models across various\ndense tasks. Specifically, SFA consists of external adapters and internal\nadapters which are sequentially operated over a transformer model. For external\nadapters, we properly select the places and amount of additional multilayer\nperception (MLP). For internal adapters, we transform a few task-important\nparameters inside the transformer, which are automatically discovered through a\nsimple yet effective lottery ticket algorithm. Our experiments show that the\ndual adapter module, a.k.a SFA, is essential to achieve the best trade-off on\ndense vision tasks, such as segmentation, detection and depth-estimation,\noutperforming other adapters with a single module.\n","authors":["Xueqing Deng","Qi Fan","Xiaojie Jin","Linjie Yang","Peng Wang"],"pdf_url":"https://arxiv.org/pdf/2310.01843v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01842v1","updated":"2023-10-03T07:14:53Z","published":"2023-10-03T07:14:53Z","title":"SelfGraphVQA: A Self-Supervised Graph Neural Network for Scene-based\n Question Answering","summary":" The intersection of vision and language is of major interest due to the\nincreased focus on seamless integration between recognition and reasoning.\nScene graphs (SGs) have emerged as a useful tool for multimodal image analysis,\nshowing impressive performance in tasks such as Visual Question Answering\n(VQA). In this work, we demonstrate that despite the effectiveness of scene\ngraphs in VQA tasks, current methods that utilize idealized annotated scene\ngraphs struggle to generalize when using predicted scene graphs extracted from\nimages. To address this issue, we introduce the SelfGraphVQA framework. Our\napproach extracts a scene graph from an input image using a pre-trained scene\ngraph generator and employs semantically-preserving augmentation with\nself-supervised techniques. This method improves the utilization of graph\nrepresentations in VQA tasks by circumventing the need for costly and\npotentially biased annotated data. By creating alternative views of the\nextracted graphs through image augmentations, we can learn joint embeddings by\noptimizing the informational content in their representations using an\nun-normalized contrastive approach. As we work with SGs, we experiment with\nthree distinct maximization strategies: node-wise, graph-wise, and\npermutation-equivariant regularization. 
We empirically showcase the\neffectiveness of the extracted scene graph for VQA and demonstrate that these\napproaches enhance overall performance by highlighting the significance of\nvisual information. This offers a more practical solution for VQA tasks that\nrely on SGs for complex reasoning questions.\n","authors":["Bruno Souza","Marius Aasan","Helio Pedrini","Adín Ramírez Rivera"],"pdf_url":"https://arxiv.org/pdf/2310.01842v1.pdf","comment":"To appear in Vision-and-Language Algorithmic Reasoning Workshop at\n ICCV 2023"},{"id":"http://arxiv.org/abs/2310.01840v1","updated":"2023-10-03T07:10:49Z","published":"2023-10-03T07:10:49Z","title":"Self-Supervised High Dynamic Range Imaging with Multi-Exposure Images in\n Dynamic Scenes","summary":" Merging multi-exposure images is a common approach for obtaining high dynamic\nrange (HDR) images, with the primary challenge being the avoidance of ghosting\nartifacts in dynamic scenes. Recent methods have proposed using deep neural\nnetworks for deghosting. However, the methods typically rely on sufficient data\nwith HDR ground-truths, which are difficult and costly to collect. In this\nwork, to eliminate the need for labeled data, we propose SelfHDR, a\nself-supervised HDR reconstruction method that only requires dynamic\nmulti-exposure images during training. Specifically, SelfHDR learns a\nreconstruction network under the supervision of two complementary components,\nwhich can be constructed from multi-exposure images and focus on HDR color as\nwell as structure, respectively. The color component is estimated from aligned\nmulti-exposure images, while the structure one is generated through a\nstructure-focused network that is supervised by the color component and an\ninput reference (\\eg, medium-exposure) image. During testing, the learned\nreconstruction network is directly deployed to predict an HDR image.\nExperiments on real-world images demonstrate our SelfHDR achieves superior\nresults against the state-of-the-art self-supervised methods, and comparable\nperformance to supervised ones. Codes are available at\nhttps://github.com/cszhilu1998/SelfHDR\n","authors":["Zhilu Zhang","Haoyu Wang","Shuai Liu","Xiaotao Wang","Lei Lei","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2310.01840v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2309.00410v2","updated":"2023-10-03T07:05:03Z","published":"2023-09-01T12:07:40Z","title":"Selective Scene Text Removal","summary":" Scene text removal (STR) is the image transformation task to remove text\nregions in scene images. The conventional STR methods remove all scene text.\nThis means that the existing methods cannot select text to be removed. In this\npaper, we propose a novel task setting named selective scene text removal\n(SSTR) that removes only target words specified by the user. Although SSTR is a\nmore complex task than STR, the proposed multi-module structure enables\nefficient training for SSTR. 
Experimental results show that the proposed method\ncan remove target words as expected.\n","authors":["Hayato Mitani","Akisato Kimura","Seiichi Uchida"],"pdf_url":"https://arxiv.org/pdf/2309.00410v2.pdf","comment":"12 pages, 8 figures, Accepted at the 34th British Machine Vision\n Conference, code:https://github.com/mitanihayato/Selective-Scene-Text-Removal"},{"id":"http://arxiv.org/abs/2310.01837v1","updated":"2023-10-03T07:01:23Z","published":"2023-10-03T07:01:23Z","title":"Extending CAM-based XAI methods for Remote Sensing Imagery Segmentation","summary":" Current AI-based methods do not provide comprehensible physical\ninterpretations of the utilized data, extracted features, and\npredictions/inference operations. As a result, deep learning models trained\nusing high-resolution satellite imagery lack transparency and explainability\nand can be merely seen as a black box, which limits their wide-level adoption.\nExperts need help understanding the complex behavior of AI models and the\nunderlying decision-making process. The explainable artificial intelligence\n(XAI) field is an emerging field providing means for robust, practical, and\ntrustworthy deployment of AI models. Several XAI techniques have been proposed\nfor image classification tasks, whereas the interpretation of image\nsegmentation remains largely unexplored. This paper aims to bridge this gap\nby adapting the recent XAI classification algorithms and making them usable for\nmulti-class image segmentation, where we mainly focus on buildings' segmentation\nfrom high-resolution satellite images. To benchmark and compare the performance\nof the proposed approaches, we introduce a new XAI evaluation methodology and\nmetric based on \"Entropy\" to measure the model uncertainty. Conventional XAI\nevaluation methods rely mainly on feeding area-of-interest regions from the\nimage back to the pre-trained (utility) model and then calculating the average\nchange in the probability of the target class. Those evaluation metrics lack\nthe needed robustness, and we show that using Entropy to monitor the model\nuncertainty in segmenting the pixels within the target class is more suitable.\nWe hope this work will pave the way for additional XAI research for image\nsegmentation and applications in the remote sensing discipline.\n","authors":["Abdul Karim Gizzini","Mustafa Shukor","Ali J. Ghandour"],"pdf_url":"https://arxiv.org/pdf/2310.01837v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01833v1","updated":"2023-10-03T06:56:07Z","published":"2023-10-03T06:56:07Z","title":"Skin the sheep not only once: Reusing Various Depth Datasets to Drive\n the Learning of Optical Flow","summary":" Optical flow estimation is crucial for various applications in vision and\nrobotics. Due to the difficulty of collecting ground truth optical flow in\nreal-world scenarios, most of the existing methods of learning optical flow\nstill adopt synthetic datasets for supervised training or utilize photometric\nconsistency across temporally adjacent video frames to drive the unsupervised\nlearning, where the former typically has issues of generalizability while the\nlatter usually performs worse than the supervised ones. 
To tackle such\nchallenges, we propose to leverage the geometric connection between optical\nflow estimation and stereo matching (based on the similarity upon finding pixel\ncorrespondences across images) to unify various real-world depth estimation\ndatasets for generating supervised training data upon optical flow.\nSpecifically, we turn the monocular depth datasets into stereo ones via\nsynthesizing virtual disparity, thus leading to the flows along the horizontal\ndirection; moreover, we introduce virtual camera motion into stereo data to\nproduce additional flows along the vertical direction. Furthermore, we propose\napplying geometric augmentations on one image of an optical flow pair,\nencouraging the optical flow estimator to learn from more challenging cases.\nLastly, as the optical flow maps under different geometric augmentations\nactually exhibit distinct characteristics, an auxiliary classifier which is trained\nto identify the type of augmentation from the appearance of the flow map is\nutilized to further enhance the learning of the optical flow estimator. Our\nproposed method is general and is not tied to any particular flow estimator,\nwhere extensive experiments based on various datasets and optical flow\nestimation models verify its efficacy and superiority.\n","authors":["Sheng-Chi Huang","Wei-Chen Chiu"],"pdf_url":"https://arxiv.org/pdf/2310.01833v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01830v1","updated":"2023-10-03T06:55:19Z","published":"2023-10-03T06:55:19Z","title":"AI-Generated Images as Data Source: The Dawn of Synthetic Era","summary":" The advancement of visual intelligence is intrinsically tethered to the\navailability of data. In parallel, generative Artificial Intelligence (AI) has\nunlocked the potential to create synthetic images that closely resemble\nreal-world photographs, which prompts a compelling inquiry: how can visual\nintelligence benefit from the advance of generative AI? This paper explores the\ninnovative concept of harnessing these AI-generated images as a new data\nsource, reshaping traditional model paradigms in visual intelligence. In\ncontrast to real data, AI-generated data sources exhibit remarkable advantages,\nincluding unmatched abundance and scalability, the rapid generation of vast\ndatasets, and the effortless simulation of edge cases. Building on the success of\ngenerative AI models, we examine the potential of their generated data in a\nrange of applications, from training machine learning models to simulating\nscenarios for computational modelling, testing, and validation. We probe the\ntechnological foundations that support this groundbreaking use of generative\nAI, engaging in an in-depth discussion on the ethical, legal, and practical\nconsiderations that accompany this transformative paradigm shift. Through an\nexhaustive survey of current technologies and applications, this paper presents\na comprehensive view of the synthetic era in visual intelligence. 
A project\nwith this paper can be found at https://github.com/mwxely/AIGS .\n","authors":["Zuhao Yang","Fangneng Zhan","Kunhao Liu","Muyu Xu","Shijian Lu"],"pdf_url":"https://arxiv.org/pdf/2310.01830v1.pdf","comment":"20 pages, 11 figures"},{"id":"http://arxiv.org/abs/2310.01828v1","updated":"2023-10-03T06:51:48Z","published":"2023-10-03T06:51:48Z","title":"Trainable Noise Model as an XAI evaluation method: application on Sobol\n for remote sensing image segmentation","summary":" eXplainable Artificial Intelligence (XAI) has emerged as an essential\nrequirement when dealing with mission-critical applications, ensuring\ntransparency and interpretability of the employed black box AI models. The\nsignificance of XAI spans various domains, from healthcare to finance, where\nunderstanding the decision-making process of deep learning algorithms is\nessential. Most AI-based computer vision models are often black boxes; hence,\nproviding explainability of deep neural networks in image processing is crucial\nfor their wide adoption and deployment in medical image analysis, autonomous\ndriving, and remote sensing applications. Recently, several XAI methods for\nimage classification tasks have been introduced. On the contrary, image\nsegmentation has received comparatively less attention in the context of\nexplainability, although it is a fundamental task in computer vision\napplications, especially in remote sensing. Only some research proposes\ngradient-based XAI algorithms for image segmentation. This paper adapts the\nrecent gradient-free Sobol XAI method for semantic segmentation. To measure the\nperformance of the Sobol method for segmentation, we propose a quantitative XAI\nevaluation method based on a learnable noise model. The main objective of this\nmodel is to induce noise on the explanation maps, where higher induced noise\nsignifies low accuracy and vice versa. A benchmark analysis is conducted to\nevaluate and compare performance of three XAI methods, including Seg-Grad-CAM,\nSeg-Grad-CAM++ and Seg-Sobol using the proposed noise-based evaluation\ntechnique. This constitutes the first attempt to run and evaluate XAI methods\nusing high-resolution satellite images.\n","authors":["Hossein Shreim","Abdul Karim Gizzini","Ali J. Ghandour"],"pdf_url":"https://arxiv.org/pdf/2310.01828v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01825v1","updated":"2023-10-03T06:42:28Z","published":"2023-10-03T06:42:28Z","title":"Empirical Study of PEFT techniques for Winter Wheat Segmentation","summary":" Parameter Efficient Fine Tuning (PEFT) techniques have recently experienced\nsignificant growth and have been extensively employed to adapt large vision and\nlanguage models to various domains, enabling satisfactory model performance\nwith minimal computational needs. Despite these advances, more research has yet\nto delve into potential PEFT applications in real-life scenarios, particularly\nin the critical domains of remote sensing and crop monitoring. The diversity of\nclimates across different regions and the need for comprehensive large-scale\ndatasets have posed significant obstacles to accurately identify crop types\nacross varying geographic locations and changing growing seasons. This study\nseeks to bridge this gap by comprehensively exploring the feasibility of\ncross-area and cross-year out-of-distribution generalization using the\nState-of-the-Art (SOTA) wheat crop monitoring model. The aim of this work is to\nexplore PEFT approaches for crop monitoring. 
Specifically, we focus on adapting\nthe SOTA TSViT model to address winter wheat field segmentation, a critical\ntask for crop monitoring and food security. This adaptation process involves\nintegrating different PEFT techniques, including BitFit, LoRA, Adaptformer, and\nprompt tuning. Using PEFT techniques, we achieved notable results comparable to\nthose achieved using full fine-tuning methods while training only a mere 0.7% of the\nparameters of the whole TSViT architecture. The in-house labeled data-set,\nreferred to as the Beqaa-Lebanon dataset, comprises high-quality annotated\npolygons for wheat and non-wheat classes with a total surface of 170 km², over\nfive consecutive years. Using Sentinel-2 images, our model achieved an 84%\nF1-score. We intend to publicly release the Lebanese winter wheat data set,\ncode repository, and model weights.\n","authors":["Mohamad Hasan Zahweh","Hasan Nasrallah","Mustafa Shukor","Ghaleb Faour","Ali J. Ghandour"],"pdf_url":"https://arxiv.org/pdf/2310.01825v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01821v1","updated":"2023-10-03T06:33:05Z","published":"2023-10-03T06:33:05Z","title":"MIMO-NeRF: Fast Neural Rendering with Multi-input Multi-output Neural\n Radiance Fields","summary":" Neural radiance fields (NeRFs) have shown impressive results for novel view\nsynthesis. However, they depend on the repetitive use of a single-input\nsingle-output multilayer perceptron (SISO MLP) that maps 3D coordinates and\nview direction to the color and volume density in a sample-wise manner, which\nslows the rendering. We propose a multi-input multi-output NeRF (MIMO-NeRF)\nthat reduces the number of MLPs running by replacing the SISO MLP with a MIMO\nMLP and conducting mappings in a group-wise manner. One notable challenge with\nthis approach is that the color and volume density of each point can differ\naccording to a choice of input coordinates in a group, which can lead to some\nnotable ambiguity. We also propose a self-supervised learning method that\nregularizes the MIMO MLP with multiple fast reformulated MLPs to alleviate this\nambiguity without using pretrained models. The results of a comprehensive\nexperimental evaluation including comparative and ablation studies are\npresented to show that MIMO-NeRF obtains a good trade-off between speed and\nquality with a reasonable training time. We then demonstrate that MIMO-NeRF is\ncompatible with and complementary to previous advancements in NeRFs by applying\nit to two representative fast NeRFs, i.e., a NeRF with sample reduction\n(DONeRF) and a NeRF with alternative representations (TensoRF).\n","authors":["Takuhiro Kaneko"],"pdf_url":"https://arxiv.org/pdf/2310.01821v1.pdf","comment":"Accepted to ICCV 2023. Project page:\n https://www.kecl.ntt.co.jp/people/kaneko.takuhiro/projects/mimo-nerf/"},{"id":"http://arxiv.org/abs/2308.14847v3","updated":"2023-10-03T06:30:34Z","published":"2023-08-28T19:08:17Z","title":"NSF: Neural Surface Fields for Human Modeling from Monocular Depth","summary":" Obtaining personalized 3D animatable avatars from a monocular camera has\nseveral real world applications in gaming, virtual try-on, animation, and\nVR/XR, etc. However, it is very challenging to model dynamic and fine-grained\nclothing deformations from such sparse data. Existing methods for modeling 3D\nhumans from depth data have limitations in terms of computational efficiency,\nmesh coherency, and flexibility in resolution and topology. 
For instance,\nreconstructing shapes using implicit functions and extracting explicit meshes\nper frame is computationally expensive and cannot ensure coherent meshes across\nframes. Moreover, predicting per-vertex deformations on a pre-designed human\ntemplate with a discrete surface lacks flexibility in resolution and topology.\nTo overcome these limitations, we propose a novel method, Neural Surface Fields (NSF),\nfor modeling 3D clothed humans from monocular depth. NSF defines a neural field\nsolely on the base surface which models a continuous and flexible displacement\nfield. NSF can be adapted to the base surface with different resolution and\ntopology without retraining at inference time. Compared to existing approaches,\nour method eliminates the expensive per-frame surface extraction while\nmaintaining mesh coherency, and is capable of reconstructing meshes with\narbitrary resolution without retraining. To foster research in this direction,\nwe release our code on the project page at: https://yuxuan-xue.com/nsf.\n","authors":["Yuxuan Xue","Bharat Lal Bhatnagar","Riccardo Marin","Nikolaos Sarafianos","Yuanlu Xu","Gerard Pons-Moll","Tony Tung"],"pdf_url":"https://arxiv.org/pdf/2308.14847v3.pdf","comment":"Accepted to ICCV 2023; Homepage at: https://yuxuan-xue.com/nsf"},{"id":"http://arxiv.org/abs/2310.01819v1","updated":"2023-10-03T06:16:38Z","published":"2023-10-03T06:16:38Z","title":"Amazing Combinatorial Creation: Acceptable Swap-Sampling for\n Text-to-Image Generation","summary":" Exploring a machine learning system to generate meaningful combinatorial\nobject images from multiple textual descriptions, emulating human creativity,\nis a significant challenge as humans are able to construct amazing\ncombinatorial objects, but machines strive to emulate data distribution. In\nthis paper, we develop a straightforward yet highly effective technique called\nacceptable swap-sampling to generate a combinatorial object image that exhibits\nnovelty and surprise, utilizing text concepts of different objects. Initially,\nwe propose a swapping mechanism that constructs a novel embedding by exchanging\ncolumn vectors of two text embeddings for generating a new combinatorial image\nthrough a cutting-edge diffusion model. Furthermore, we design an acceptable\nregion by managing suitable CLIP distances between the new image and the\noriginal concept generations, increasing the likelihood of accepting the new\nimage with a high-quality combination. This region allows us to efficiently\nsample a small subset from a new image pool generated by randomly\nexchanging column vectors. Lastly, we employ a segmentation method to compare\nCLIP distances among the segmented components, ultimately selecting the most\npromising object image from the sampled subset. Our experiments focus on text\npairs of objects from ImageNet, and our results demonstrate that our approach\noutperforms recent methods such as Stable-Diffusion2, DALLE2, ERNIE-ViLG2 and\nBing in generating novel and surprising object images, even when the associated\nconcepts appear to be implausible, such as lionfish-abacus. 
Furthermore, during\nthe sampling process, our approach without training and human preference is\nalso comparable to PickScore and HPSv2 trained using human preference datasets.\n","authors":["Jun Li","Zedong Zhang","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2310.01819v1.pdf","comment":"Project page: \\url{https://asst2i.github.io/anon/}"},{"id":"http://arxiv.org/abs/2305.18030v2","updated":"2023-10-03T06:11:48Z","published":"2023-05-25T19:41:40Z","title":"Automated Search-Space Generation Neural Architecture Search","summary":" To search an optimal sub-network within a general deep neural network (DNN),\nexisting neural architecture search (NAS) methods typically rely on\nhandcrafting a search space beforehand. Such requirements make it challenging\nto extend them onto general scenarios without significant human expertise and\nmanual intervention. To overcome the limitations, we propose Automated\nSearch-Space Generation Neural Architecture Search (ASGNAS), perhaps the first\nautomated system to train general DNNs that cover all candidate connections and\noperations and produce high-performing sub-networks in the one shot manner.\nTechnologically, ASGNAS delivers three noticeable contributions to minimize\nhuman efforts: (i) automated search space generation for general DNNs; (ii) a\nHierarchical Half-Space Projected Gradient (H2SPG) that leverages the hierarchy\nand dependency within generated search space to ensure the network validity\nduring optimization, and reliably produces a solution with both high\nperformance and hierarchical group sparsity; and (iii) automated sub-network\nconstruction upon the H2SPG solution. Numerically, we demonstrate the\neffectiveness of ASGNAS on a variety of general DNNs, including RegNet,\nStackedUnets, SuperResNet, and DARTS, over benchmark datasets such as CIFAR10,\nFashion-MNIST, ImageNet, STL-10 , and SVNH. The sub-networks computed by ASGNAS\nachieve competitive even superior performance compared to the starting full\nDNNs and other state-of-the-arts. The library will be released at\nhttps://github.com/tianyic/only_train_once.\n","authors":["Tianyi Chen","Luming Liang","Tianyu Ding","Ilya Zharkov"],"pdf_url":"https://arxiv.org/pdf/2305.18030v2.pdf","comment":"Graph visualization for DARTS, SuperResNet are omitted for arXiv\n version due to exceeding page dimension limit. Please refer to the\n open-review version for taking the visualizations"},{"id":"http://arxiv.org/abs/2310.01812v1","updated":"2023-10-03T05:55:11Z","published":"2023-10-03T05:55:11Z","title":"PPT: Token Pruning and Pooling for Efficient Vision Transformers","summary":" Vision Transformers (ViTs) have emerged as powerful models in the field of\ncomputer vision, delivering superior performance across various vision tasks.\nHowever, the high computational complexity poses a significant barrier to their\npractical applications in real-world scenarios. Motivated by the fact that not\nall tokens contribute equally to the final predictions and fewer tokens bring\nless computational cost, reducing redundant tokens has become a prevailing\nparadigm for accelerating vision transformers. However, we argue that it is not\noptimal to either only reduce inattentive redundancy by token pruning, or only\nreduce duplicative redundancy by token merging. To this end, in this paper we\npropose a novel acceleration framework, namely token Pruning & Pooling\nTransformers (PPT), to adaptively tackle these two types of redundancy in\ndifferent layers. 
By heuristically integrating both token pruning and token\npooling techniques in ViTs without additional trainable parameters, PPT\neffectively reduces the model complexity while maintaining its predictive\naccuracy. For example, PPT reduces over 37% FLOPs and improves the throughput\nby over 45% for DeiT-S without any accuracy drop on the ImageNet dataset.\n","authors":["Xinjian Wu","Fanhu Zeng","Xiudong Wang","Yunhe Wang","Xinghao Chen"],"pdf_url":"https://arxiv.org/pdf/2310.01812v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01806v1","updated":"2023-10-03T05:39:36Z","published":"2023-10-03T05:39:36Z","title":"Improvement and Enhancement of YOLOv5 Small Target Recognition Based on\n Multi-module Optimization","summary":" In this paper, the limitations of YOLOv5s model on small target detection\ntask are deeply studied and improved. The performance of the model is\nsuccessfully enhanced by introducing GhostNet-based convolutional module,\nRepGFPN-based Neck module optimization, CA and Transformer's attention\nmechanism, and loss function improvement using NWD. The experimental results\nvalidate the positive impact of these improvement strategies on model\nprecision, recall and mAP. In particular, the improved model shows significant\nsuperiority in dealing with complex backgrounds and tiny targets in real-world\napplication tests. This study provides an effective optimization strategy for\nthe YOLOv5s model on small target detection, and lays a solid foundation for\nfuture related research and applications.\n","authors":["Qingyang Li","Yuchen Li","Hongyi Duan","JiaLiang Kang","Jianan Zhang","Xueqian Gan","Ruotong Xu"],"pdf_url":"https://arxiv.org/pdf/2310.01806v1.pdf","comment":"8 pages 10 figures"},{"id":"http://arxiv.org/abs/2310.01799v1","updated":"2023-10-03T05:05:35Z","published":"2023-10-03T05:05:35Z","title":"SMRD: SURE-based Robust MRI Reconstruction with Diffusion Models","summary":" Diffusion models have recently gained popularity for accelerated MRI\nreconstruction due to their high sample quality. They can effectively serve as\nrich data priors while incorporating the forward model flexibly at inference\ntime, and they have been shown to be more robust than unrolled methods under\ndistribution shifts. However, diffusion models require careful tuning of\ninference hyperparameters on a validation set and are still sensitive to\ndistribution shifts during testing. To address these challenges, we introduce\nSURE-based MRI Reconstruction with Diffusion models (SMRD), a method that\nperforms test-time hyperparameter tuning to enhance robustness during testing.\nSMRD uses Stein's Unbiased Risk Estimator (SURE) to estimate the mean squared\nerror of the reconstruction during testing. SURE is then used to automatically\ntune the inference hyperparameters and to set an early stopping criterion\nwithout the need for validation tuning. To the best of our knowledge, SMRD is\nthe first to incorporate SURE into the sampling stage of diffusion models for\nautomatic hyperparameter selection. SMRD outperforms diffusion model baselines\non various measurement noise levels, acceleration factors, and anatomies,\nachieving a PSNR improvement of up to 6 dB under measurement noise. 
The code is\npublicly available at https://github.com/batuozt/SMRD .\n","authors":["Batu Ozturkler","Chao Liu","Benjamin Eckart","Morteza Mardani","Jiaming Song","Jan Kautz"],"pdf_url":"https://arxiv.org/pdf/2310.01799v1.pdf","comment":"To appear at MICCAI 2023"},{"id":"http://arxiv.org/abs/2305.04195v3","updated":"2023-10-03T04:42:09Z","published":"2023-05-07T05:40:48Z","title":"Cross-Modal Retrieval for Motion and Text via DopTriple Loss","summary":" Cross-modal retrieval of image-text and video-text is a prominent research\narea in computer vision and natural language processing. However, there has\nbeen insufficient attention given to cross-modal retrieval between human motion\nand text, despite its wide-ranging applicability. To address this gap, we\nutilize a concise yet effective dual-unimodal transformer encoder for tackling\nthis task. Recognizing that overlapping atomic actions in different human\nmotion sequences can lead to semantic conflicts between samples, we explore a\nnovel triplet loss function called DropTriple Loss. This loss function discards\nfalse negative samples from the negative sample set and focuses on mining\nremaining genuinely hard negative samples for triplet training, thereby\nreducing violations they cause. We evaluate our model and approach on the\nHumanML3D and KIT Motion-Language datasets. On the latest HumanML3D dataset, we\nachieve a recall of 62.9% for motion retrieval and 71.5% for text retrieval\n(both based on R@10). The source code for our approach is publicly available at\nhttps://github.com/eanson023/rehamot.\n","authors":["Sheng Yan","Yang Liu","Haoqiang Wang","Xin Du","Mengyuan Liu","Hong Liu"],"pdf_url":"https://arxiv.org/pdf/2305.04195v3.pdf","comment":"This paper is accepted by ACM MM Asia 2023"},{"id":"http://arxiv.org/abs/2305.09241v5","updated":"2023-10-03T04:25:41Z","published":"2023-05-16T07:40:05Z","title":"Unlearnable Examples Give a False Sense of Security: Piercing through\n Unexploitable Data with Learnable Examples","summary":" Safeguarding data from unauthorized exploitation is vital for privacy and\nsecurity, especially in recent rampant research in security breach such as\nadversarial/membership attacks. To this end, \\textit{unlearnable examples}\n(UEs) have been recently proposed as a compelling protection, by adding\nimperceptible perturbation to data so that models trained on them cannot\nclassify them accurately on original clean distribution. Unfortunately, we find\nUEs provide a false sense of security, because they cannot stop unauthorized\nusers from utilizing other unprotected data to remove the protection, by\nturning unlearnable data into learnable again. Motivated by this observation,\nwe formally define a new threat by introducing \\textit{learnable unauthorized\nexamples} (LEs) which are UEs with their protection removed. The core of this\napproach is a novel purification process that projects UEs onto the manifold of\nLEs. This is realized by a new joint-conditional diffusion model which denoises\nUEs conditioned on the pixel and perceptual similarity between UEs and LEs.\nExtensive experiments demonstrate that LE delivers state-of-the-art countering\nperformance against both supervised UEs and unsupervised UEs in various\nscenarios, which is the first generalizable countermeasure to UEs across\nsupervised learning and unsupervised learning. 
Our code is available at\n\\url{https://github.com/jiangw-0/LE_JCDP}.\n","authors":["Wan Jiang","Yunfeng Diao","He Wang","Jianxin Sun","Meng Wang","Richang Hong"],"pdf_url":"https://arxiv.org/pdf/2305.09241v5.pdf","comment":"Accepted in MM 2023"},{"id":"http://arxiv.org/abs/2310.01779v1","updated":"2023-10-03T04:01:27Z","published":"2023-10-03T04:01:27Z","title":"HallE-Switch: Rethinking and Controlling Object Existence Hallucinations\n in Large Vision Language Models for Detailed Caption","summary":" Current large vision-language models (LVLMs) achieve remarkable progress, yet\nthere remains significant uncertainty regarding their ability to accurately\napprehend visual details, that is, in performing detailed captioning. To\naddress this, we introduce \\textit{CCEval}, a GPT-4 assisted evaluation method\ntailored for detailed captioning. Interestingly, while LVLMs demonstrate\nminimal object existence hallucination in existing VQA benchmarks, our proposed\nevaluation reveals continued susceptibility to such hallucinations. In this\npaper, we make the first attempt to investigate and attribute such\nhallucinations, including image resolution, the language decoder size, and\ninstruction data amount, quality, granularity. Our findings underscore the\nunwarranted inference when the language description includes details at a finer\nobject granularity than what the vision module can ground or verify, thus\ninducing hallucination. To control such hallucinations, we further attribute\nthe reliability of captioning to contextual knowledge (involving only\ncontextually grounded objects) and parametric knowledge (containing inferred\nobjects by the model). Thus, we introduce $\\textit{HallE-Switch}$, a\ncontrollable LVLM in terms of $\\textbf{Hall}$ucination in object\n$\\textbf{E}$xistence. HallE-Switch can condition the captioning to shift\nbetween (i) exclusively depicting contextual knowledge for grounded objects and\n(ii) blending it with parametric knowledge to imagine inferred objects. Our\nmethod reduces hallucination by 44% compared to LLaVA$_{7B}$ and maintains the\nsame object coverage.\n","authors":["Bohan Zhai","Shijia Yang","Xiangchen Zhao","Chenfeng Xu","Sheng Shen","Dongdi Zhao","Kurt Keutzer","Manling Li","Tan Yan","Xiangjun Fan"],"pdf_url":"https://arxiv.org/pdf/2310.01779v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01755v1","updated":"2023-10-03T02:37:57Z","published":"2023-10-03T02:37:57Z","title":"ImageNet-OOD: Deciphering Modern Out-of-Distribution Detection\n Algorithms","summary":" The task of out-of-distribution (OOD) detection is notoriously ill-defined.\nEarlier works focused on new-class detection, aiming to identify label-altering\ndata distribution shifts, also known as \"semantic shift.\" However, recent works\nargue for a focus on failure detection, expanding the OOD evaluation framework\nto account for label-preserving data distribution shifts, also known as\n\"covariate shift.\" Intriguingly, under this new framework, complex OOD\ndetectors that were previously considered state-of-the-art now perform\nsimilarly to, or even worse than the simple maximum softmax probability\nbaseline. This raises the question: what are the latest OOD detectors actually\ndetecting? Deciphering the behavior of OOD detection algorithms requires\nevaluation datasets that decouples semantic shift and covariate shift. To aid\nour investigations, we present ImageNet-OOD, a clean semantic shift dataset\nthat minimizes the interference of covariate shift. 
Through comprehensive\nexperiments, we show that OOD detectors are more sensitive to covariate shift\nthan to semantic shift, and the benefits of recent OOD detection algorithms on\nsemantic shift detection is minimal. Our dataset and analyses provide important\ninsights for guiding the design of future OOD detectors.\n","authors":["William Yang","Byron Zhang","Olga Russakovsky"],"pdf_url":"https://arxiv.org/pdf/2310.01755v1.pdf","comment":"28 pages, 11 figures"},{"id":"http://arxiv.org/abs/2304.08612v2","updated":"2023-10-03T02:07:30Z","published":"2023-04-17T20:59:49Z","title":"Bridging Discrete and Backpropagation: Straight-Through and Beyond","summary":" Backpropagation, the cornerstone of deep learning, is limited to computing\ngradients for continuous variables. This limitation poses challenges for\nproblems involving discrete latent variables. To address this issue, we propose\na novel approach to approximate the gradient of parameters involved in\ngenerating discrete latent variables. First, we examine the widely used\nStraight-Through (ST) heuristic and demonstrate that it works as a first-order\napproximation of the gradient. Guided by our findings, we propose ReinMax,\nwhich achieves second-order accuracy by integrating Heun's method, a\nsecond-order numerical method for solving ODEs. ReinMax does not require\nHessian or other second-order derivatives, thus having negligible computation\noverheads. Extensive experimental results on various tasks demonstrate the\nsuperiority of ReinMax over the state of the art. Implementations are released\nat https://github.com/microsoft/ReinMax.\n","authors":["Liyuan Liu","Chengyu Dong","Xiaodong Liu","Bin Yu","Jianfeng Gao"],"pdf_url":"https://arxiv.org/pdf/2304.08612v2.pdf","comment":"NeurIPS 2023 (Oral)"},{"id":"http://arxiv.org/abs/2309.17104v2","updated":"2023-10-03T01:51:26Z","published":"2023-09-29T10:06:05Z","title":"Prototype-guided Cross-modal Completion and Alignment for Incomplete\n Text-based Person Re-identification","summary":" Traditional text-based person re-identification (ReID) techniques heavily\nrely on fully matched multi-modal data, which is an ideal scenario. However,\ndue to inevitable data missing and corruption during the collection and\nprocessing of cross-modal data, the incomplete data issue is usually met in\nreal-world applications. Therefore, we consider a more practical task termed\nthe incomplete text-based ReID task, where person images and text descriptions\nare not completely matched and contain partially missing modality data. To this\nend, we propose a novel Prototype-guided Cross-modal Completion and Alignment\n(PCCA) framework to handle the aforementioned issues for incomplete text-based\nReID. Specifically, we cannot directly retrieve person images based on a text\nquery on missing modality data. Therefore, we propose the cross-modal nearest\nneighbor construction strategy for missing data by computing the cross-modal\nsimilarity between existing images and texts, which provides key guidance for\nthe completion of missing modal features. Furthermore, to efficiently complete\nthe missing modal features, we construct the relation graphs with the\naforementioned cross-modal nearest neighbor sets of missing modal data and the\ncorresponding prototypes, which can further enhance the generated missing modal\nfeatures. 
Additionally, for tighter fine-grained alignment between images and\ntexts, we raise a prototype-aware cross-modal alignment loss that can\neffectively reduce the modality heterogeneity gap for better fine-grained\nalignment in common space. Extensive experimental results on several benchmarks\nwith different missing ratios amply demonstrate that our method can\nconsistently outperform state-of-the-art text-image ReID approaches.\n","authors":["Tiantian Gong","Guodong Du","Junsheng Wang","Yongkang Ding","Liyan Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.17104v2.pdf","comment":"Sorry, some collaborators do not agree to publish it on Arxiv, so\n please withdraw this paper"},{"id":"http://arxiv.org/abs/2310.01735v1","updated":"2023-10-03T01:50:48Z","published":"2023-10-03T01:50:48Z","title":"Learning Expected Appearances for Intraoperative Registration during\n Neurosurgery","summary":" We present a novel method for intraoperative patient-to-image registration by\nlearning Expected Appearances. Our method uses preoperative imaging to\nsynthesize patient-specific expected views through a surgical microscope for a\npredicted range of transformations. Our method estimates the camera pose by\nminimizing the dissimilarity between the intraoperative 2D view through the\noptical microscope and the synthesized expected texture. In contrast to\nconventional methods, our approach transfers the processing tasks to the\npreoperative stage, reducing thereby the impact of low-resolution, distorted,\nand noisy intraoperative images, that often degrade the registration accuracy.\nWe applied our method in the context of neuronavigation during brain surgery.\nWe evaluated our approach on synthetic data and on retrospective data from 6\nclinical cases. Our method outperformed state-of-the-art methods and achieved\naccuracies that met current clinical standards.\n","authors":["Nazim Haouchine","Reuben Dorent","Parikshit Juvekar","Erickson Torio","William M. Wells III","Tina Kapur","Alexandra J. Golby","Sarah Frisken"],"pdf_url":"https://arxiv.org/pdf/2310.01735v1.pdf","comment":"Accepted at MICCAI 2023"},{"id":"http://arxiv.org/abs/2310.01712v1","updated":"2023-10-03T00:54:13Z","published":"2023-10-03T00:54:13Z","title":"Generative Autoencoding of Dropout Patterns","summary":" We propose a generative model termed Deciphering Autoencoders. In this model,\nwe assign a unique random dropout pattern to each data point in the training\ndataset and then train an autoencoder to reconstruct the corresponding data\npoint using this pattern as information to be encoded. Since the training of\nDeciphering Autoencoders relies solely on reconstruction error, it offers more\nstable training than other generative models. Despite its simplicity,\nDeciphering Autoencoders show comparable sampling quality to DCGAN on the\nCIFAR-10 dataset.\n","authors":["Shunta Maeda"],"pdf_url":"https://arxiv.org/pdf/2310.01712v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13196v2","updated":"2023-10-03T23:52:05Z","published":"2023-06-22T20:40:24Z","title":"DiMSam: Diffusion Models as Samplers for Task and Motion Planning under\n Partial Observability","summary":" Task and Motion Planning (TAMP) approaches are effective at planning\nlong-horizon autonomous robot manipulation. However, it can be difficult to\napply them to domains where the environment and its dynamics are not fully\nknown. 
We propose to overcome these limitations by leveraging deep generative\nmodeling, specifically diffusion models, to learn constraints and samplers that\ncapture these difficult-to-engineer aspects of the planning model. These\nlearned samplers are composed and combined within a TAMP solver in order to\nfind action parameter values jointly that satisfy the constraints along a plan.\nTo tractably make predictions for unseen objects in the environment, we define\nthese samplers on low-dimensional learned latent embeddings of changing object\nstate. We evaluate our approach in an articulated object manipulation domain\nand show how the combination of classical TAMP, generative learning, and latent\nembeddings enables long-horizon constraint-based reasoning. We also apply the\nlearned sampler in the real world. More details are available at\nhttps://sites.google.com/view/dimsam-tamp\n","authors":["Xiaolin Fang","Caelan Reed Garrett","Clemens Eppner","Tomás Lozano-Pérez","Leslie Pack Kaelbling","Dieter Fox"],"pdf_url":"https://arxiv.org/pdf/2306.13196v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02492v1","updated":"2023-10-03T23:44:35Z","published":"2023-10-03T23:44:35Z","title":"Eye Fairness: A Large-Scale 3D Imaging Dataset for Equitable Eye\n Diseases Screening and Fair Identity Scaling","summary":" Fairness or equity in machine learning is profoundly important for societal\nwell-being, but limited public datasets hinder its progress, especially in the\narea of medicine. It is undeniable that fairness in medicine is one of the most\nimportant areas for fairness learning's applications. Currently, no large-scale\npublic medical datasets with 3D imaging data for fairness learning are\navailable, while 3D imaging data in modern clinics are standard tests for\ndisease diagnosis. In addition, existing medical fairness datasets are actually\nrepurposed datasets, and therefore they typically have limited demographic\nidentity attributes with at most three identity attributes of age, gender, and\nrace for fairness modeling. To address this gap, we introduce our Eye Fairness\ndataset with 30,000 subjects (Harvard-EF) covering three major eye diseases\nincluding age-related macular degeneration, diabetic retinopathy, and glaucoma\naffecting 380 million patients globally. Our Harvard-EF dataset includes both\n2D fundus photos and 3D optical coherence tomography scans with six demographic\nidentity attributes including age, gender, race, ethnicity, preferred language,\nand marital status. We also propose a fair identity scaling (FIS) approach\ncombining group and individual scaling together to improve model fairness. Our\nFIS approach is compared with various state-of-the-art fairness learning\nmethods with superior performance in the racial, gender, and ethnicity fairness\ntasks with 2D and 3D imaging data, which demonstrate the utilities of our\nHarvard-EF dataset for fairness learning. To facilitate fairness comparisons\nbetween different models, we propose performance-scaled disparity measures,\nwhich can be used to compare model fairness accounting for overall performance\nlevels. 
The dataset and code are publicly accessible via\n\\url{https://ophai.hms.harvard.edu/datasets/harvard-ef30k}.\n","authors":["Yan Luo","Yu Tian","Min Shi","Tobias Elze","Mengyu Wang"],"pdf_url":"https://arxiv.org/pdf/2310.02492v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02486v1","updated":"2023-10-03T23:25:19Z","published":"2023-10-03T23:25:19Z","title":"OCU-Net: A Novel U-Net Architecture for Enhanced Oral Cancer\n Segmentation","summary":" Accurate detection of oral cancer is crucial for improving patient outcomes.\nHowever, the field faces two key challenges: the scarcity of deep\nlearning-based image segmentation research specifically targeting oral cancer\nand the lack of annotated data. Our study proposes OCU-Net, a pioneering U-Net\nimage segmentation architecture exclusively designed to detect oral cancer in\nhematoxylin and eosin (H&E) stained image datasets. OCU-Net incorporates\nadvanced deep learning modules, such as the Channel and Spatial Attention\nFusion (CSAF) module, a novel and innovative feature that emphasizes important\nchannel and spatial areas in H&E images while exploring contextual information.\nIn addition, OCU-Net integrates other innovative components such as\nSqueeze-and-Excite (SE) attention module, Atrous Spatial Pyramid Pooling (ASPP)\nmodule, residual blocks, and multi-scale fusion. The incorporation of these\nmodules showed superior performance for oral cancer segmentation for two\ndatasets used in this research. Furthermore, we effectively utilized the\nefficient ImageNet pre-trained MobileNet-V2 model as a backbone of our OCU-Net\nto create OCU-Netm, an enhanced version achieving state-of-the-art results.\nComprehensive evaluation demonstrates that OCU-Net and OCU-Netm outperformed\nexisting segmentation methods, highlighting their precision in identifying\ncancer cells in H&E images from OCDC and ORCA datasets.\n","authors":["Ahmed Albishri","Syed Jawad Hussain Shah","Yugyung Lee","Rong Wang"],"pdf_url":"https://arxiv.org/pdf/2310.02486v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14779v2","updated":"2023-10-03T23:01:05Z","published":"2023-05-24T06:35:26Z","title":"Alt-Text with Context: Improving Accessibility for Images on Twitter","summary":" In this work we present an approach for generating alternative text (or\nalt-text) descriptions for images shared on social media, specifically Twitter.\nMore than just a special case of image captioning, alt-text is both more\nliterally descriptive and context-specific. Also critically, images posted to\nTwitter are often accompanied by user-written text that despite not necessarily\ndescribing the image may provide useful context that if properly leveraged can\nbe informative. We address this task with a multimodal model that conditions on\nboth textual information from the associated social media post as well as\nvisual signal from the image, and demonstrate that the utility of these two\ninformation sources stacks. We put forward a new dataset of 371k images paired\nwith alt-text and tweets scraped from Twitter and evaluate on it across a\nvariety of automated metrics as well as human evaluation. 
We show that our\napproach of conditioning on both tweet text and visual information\nsignificantly outperforms prior work, by more than 2x on BLEU@4.\n","authors":["Nikita Srivatsan","Sofia Samaniego","Omar Florez","Taylor Berg-Kirkpatrick"],"pdf_url":"https://arxiv.org/pdf/2305.14779v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.00645v4","updated":"2023-10-03T22:35:22Z","published":"2022-06-01T17:29:26Z","title":"Floorplan Restoration by Structure Hallucinating Transformer Cascades","summary":" This paper presents an extreme floorplan reconstruction task, a new benchmark\nfor the task, and a neural architecture as a solution. Given a partial\nfloorplan reconstruction inferred or curated from panorama images, the task is\nto reconstruct a complete floorplan including invisible architectural\nstructures. The proposed neural network 1) encodes an input partial floorplan\ninto a set of latent vectors by convolutional neural networks and a\nTransformer; and 2) reconstructs an entire floorplan while hallucinating\ninvisible rooms and doors by cascading Transformer decoders. Qualitative and\nquantitative evaluations demonstrate effectiveness of our approach over the\nbenchmark of 701 houses, outperforming the state-of-the-art reconstruction\ntechniques. We will share our code, models, and data.\n","authors":["Sepidehsadat Hosseini","Yasutaka Furukawa"],"pdf_url":"https://arxiv.org/pdf/2206.00645v4.pdf","comment":"Published at BMVC 2023"},{"id":"http://arxiv.org/abs/2211.13785v3","updated":"2023-10-03T22:29:43Z","published":"2022-11-24T20:06:11Z","title":"PuzzleFusion: Unleashing the Power of Diffusion Models for Spatial\n Puzzle Solving","summary":" This paper presents an end-to-end neural architecture based on Diffusion\nModels for spatial puzzle solving, particularly jigsaw puzzle and room\narrangement tasks. In the latter task, for instance, the proposed system\n\"PuzzleFusion\" takes a set of room layouts as polygonal curves in the top-down\nview and aligns the room layout pieces by estimating their 2D translations and\nrotations, akin to solving the jigsaw puzzle of room layouts. A surprising\ndiscovery of the paper is that the simple use of a Diffusion Model effectively\nsolves these challenging spatial puzzle tasks as a conditional generation\nprocess. To enable learning of an end-to-end neural system, the paper\nintroduces new datasets with ground-truth arrangements: 1) 2D Voronoi jigsaw\ndataset, a synthetic one where pieces are generated by Voronoi diagram of 2D\npointset; and 2) MagicPlan dataset, a real one offered by MagicPlan from its\nproduction pipeline, where pieces are room layouts constructed by augmented\nreality App by real-estate consumers. The qualitative and quantitative\nevaluations demonstrate that our approach outperforms the competing methods by\nsignificant margins in all the tasks.\n","authors":["Sepidehsadat Hosseini","Mohammad Amin Shabani","Saghar Irandoust","Yasutaka Furukawa"],"pdf_url":"https://arxiv.org/pdf/2211.13785v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08011v2","updated":"2023-10-03T22:12:08Z","published":"2023-08-15T19:50:38Z","title":"Shortcut-V2V: Compression Framework for Video-to-Video Translation based\n on Temporal Redundancy Reduction","summary":" Video-to-video translation aims to generate video frames of a target domain\nfrom an input video. 
Despite its usefulness, the existing networks require\nenormous computations, necessitating their model compression for wide use.\nWhile there exist compression methods that improve computational efficiency in\nvarious image/video tasks, a generally-applicable compression method for\nvideo-to-video translation has not been studied much. In response, we present\nShortcut-V2V, a general-purpose compression framework for video-to-video\ntranslation. Shourcut-V2V avoids full inference for every neighboring video\nframe by approximating the intermediate features of a current frame from those\nof the previous frame. Moreover, in our framework, a newly-proposed block\ncalled AdaBD adaptively blends and deforms features of neighboring frames,\nwhich makes more accurate predictions of the intermediate features possible. We\nconduct quantitative and qualitative evaluations using well-known\nvideo-to-video translation models on various tasks to demonstrate the general\napplicability of our framework. The results show that Shourcut-V2V achieves\ncomparable performance compared to the original video-to-video translation\nmodel while saving 3.2-5.7x computational cost and 7.8-44x memory at test time.\n","authors":["Chaeyeon Chung","Yeojeong Park","Seunghwan Choi","Munkhsoyol Ganbat","Jaegul Choo"],"pdf_url":"https://arxiv.org/pdf/2308.08011v2.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2309.05148v2","updated":"2023-10-03T22:10:02Z","published":"2023-09-10T21:52:47Z","title":"Beyond Skin Tone: A Multidimensional Measure of Apparent Skin Color","summary":" This paper strives to measure apparent skin color in computer vision, beyond\na unidimensional scale on skin tone. In their seminal paper Gender Shades,\nBuolamwini and Gebru have shown how gender classification systems can be biased\nagainst women with darker skin tones. Subsequently, fairness researchers and\npractitioners have adopted the Fitzpatrick skin type classification as a common\nmeasure to assess skin color bias in computer vision systems. While effective,\nthe Fitzpatrick scale only focuses on the skin tone ranging from light to dark.\nTowards a more comprehensive measure of skin color, we introduce the hue angle\nranging from red to yellow. When applied to images, the hue dimension reveals\nadditional biases related to skin color in both computer vision datasets and\nmodels. We then recommend multidimensional skin color scales, relying on both\nskin tone and hue, for fairness assessments.\n","authors":["William Thong","Przemyslaw Joniak","Alice Xiang"],"pdf_url":"https://arxiv.org/pdf/2309.05148v2.pdf","comment":"Accepted at the International Conference on Computer Vision (ICCV)\n 2023"},{"id":"http://arxiv.org/abs/2309.10829v2","updated":"2023-10-03T21:45:52Z","published":"2023-09-16T11:58:04Z","title":"Comparative study of Deep Learning Models for Binary Classification on\n Combined Pulmonary Chest X-ray Dataset","summary":" CNN-based deep learning models for disease detection have become popular\nrecently. We compared the binary classification performance of eight prominent\ndeep learning models: DenseNet 121, DenseNet 169, DenseNet 201, EffecientNet\nb0, EffecientNet lite4, GoogleNet, MobileNet, and ResNet18 for their binary\nclassification performance on combined Pulmonary Chest Xrays dataset. Despite\nthe widespread application in different fields in medical images, there remains\na knowledge gap in determining their relative performance when applied to the\nsame dataset, a gap this study aimed to address. 
The dataset combined Shenzhen,\nChina (CH) and Montgomery, USA (MC) data. We trained our model for binary\nclassification, calculated different parameters of the mentioned models, and\ncompared them. The models were trained to keep in mind all following the same\ntraining parameters to maintain a controlled comparison environment. End of the\nstudy, we found a distinct difference in performance among the other models\nwhen applied to the pulmonary chest Xray image dataset, where DenseNet169\nperformed with 89.38 percent and MobileNet with 92.2 percent precision.\n Keywords: Pulmonary, Deep Learning, Tuberculosis, Disease detection, Xray\n","authors":["Shabbir Ahmed Shuvo","Md Aminul Islam","Md. Mozammel Hoque","Rejwan Bin Sulaiman"],"pdf_url":"https://arxiv.org/pdf/2309.10829v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.12594v2","updated":"2023-10-03T21:31:01Z","published":"2023-09-22T02:46:43Z","title":"DeFormer: Integrating Transformers with Deformable Models for 3D Shape\n Abstraction from a Single Image","summary":" Accurate 3D shape abstraction from a single 2D image is a long-standing\nproblem in computer vision and graphics. By leveraging a set of primitives to\nrepresent the target shape, recent methods have achieved promising results.\nHowever, these methods either use a relatively large number of primitives or\nlack geometric flexibility due to the limited expressibility of the primitives.\nIn this paper, we propose a novel bi-channel Transformer architecture,\nintegrated with parameterized deformable models, termed DeFormer, to\nsimultaneously estimate the global and local deformations of primitives. In\nthis way, DeFormer can abstract complex object shapes while using a small\nnumber of primitives which offer a broader geometry coverage and finer details.\nThen, we introduce a force-driven dynamic fitting and a cycle-consistent\nre-projection loss to optimize the primitive parameters. Extensive experiments\non ShapeNet across various settings show that DeFormer achieves better\nreconstruction accuracy over the state-of-the-art, and visualizes with\nconsistent semantic correspondences for improved interpretability.\n","authors":["Di Liu","Xiang Yu","Meng Ye","Qilong Zhangli","Zhuowei Li","Zhixing Zhang","Dimitris N. Metaxas"],"pdf_url":"https://arxiv.org/pdf/2309.12594v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2104.12294v2","updated":"2023-10-03T21:18:00Z","published":"2021-04-26T00:37:11Z","title":"Wise-SrNet: A Novel Architecture for Enhancing Image Classification by\n Learning Spatial Resolution of Feature Maps","summary":" One of the main challenges since the advancement of convolutional neural\nnetworks is how to connect the extracted feature map to the final\nclassification layer. VGG models used two sets of fully connected layers for\nthe classification part of their architectures, which significantly increased\nthe number of models' weights. ResNet and the next deep convolutional models\nused the Global Average Pooling (GAP) layer to compress the feature map and\nfeed it to the classification layer. Although using the GAP layer reduces the\ncomputational cost, but also causes losing spatial resolution of the feature\nmap, which results in decreasing learning efficiency. In this paper, we aim to\ntackle this problem by replacing the GAP layer with a new architecture called\nWise-SrNet. It is inspired by the depthwise convolutional idea and is designed\nfor processing spatial resolution while not increasing computational cost. 
We\nhave evaluated our method using three different datasets: Intel Image\nClassification Challenge, MIT Indoors Scenes, and a part of the ImageNet\ndataset. We investigated the implementation of our architecture on several\nmodels of the Inception, ResNet, and DenseNet families. Applying our\narchitecture has revealed a significant effect on increasing convergence speed\nand accuracy. Our Experiments on images with 224*224 resolution increased the\nTop-1 accuracy between 2% to 8% on different datasets and models. Running our\nmodels on 512*512 resolution images of the MIT Indoors Scenes dataset showed a\nnotable result of improving the Top-1 accuracy within 3% to 26%. We will also\ndemonstrate the GAP layer's disadvantage when the input images are large and\nthe number of classes is not few. In this circumstance, our proposed\narchitecture can do a great help in enhancing classification results. The code\nis shared at https://github.com/mr7495/image-classification-spatial.\n","authors":["Mohammad Rahimzadeh","AmirAli Askari","Soroush Parvin","Elnaz Safi","Mohammad Reza Mohammadi"],"pdf_url":"https://arxiv.org/pdf/2104.12294v2.pdf","comment":"The code is shared at\n https://github.com/mr7495/image-classification-spatial"},{"id":"http://arxiv.org/abs/2310.02437v1","updated":"2023-10-03T21:08:41Z","published":"2023-10-03T21:08:41Z","title":"EvDNeRF: Reconstructing Event Data with Dynamic Neural Radiance Fields","summary":" We present EvDNeRF, a pipeline for generating event data and training an\nevent-based dynamic NeRF, for the purpose of faithfully reconstructing\neventstreams on scenes with rigid and non-rigid deformations that may be too\nfast to capture with a standard camera. Event cameras register asynchronous\nper-pixel brightness changes at MHz rates with high dynamic range, making them\nideal for observing fast motion with almost no motion blur. Neural radiance\nfields (NeRFs) offer visual-quality geometric-based learnable rendering, but\nprior work with events has only considered reconstruction of static scenes. Our\nEvDNeRF can predict eventstreams of dynamic scenes from a static or moving\nviewpoint between any desired timestamps, thereby allowing it to be used as an\nevent-based simulator for a given scene. We show that by training on varied\nbatch sizes of events, we can improve test-time predictions of events at fine\ntime resolutions, outperforming baselines that pair standard dynamic NeRFs with\nevent simulators. We release our simulated and real datasets, as well as code\nfor both event-based data generation and the training of event-based dynamic\nNeRF models (https://github.com/anish-bhattacharya/EvDNeRF).\n","authors":["Anish Bhattacharya","Ratnesh Madaan","Fernando Cladera","Sai Vemprala","Rogerio Bonatti","Kostas Daniilidis","Ashish Kapoor","Vijay Kumar","Nikolai Matni","Jayesh K. Gupta"],"pdf_url":"https://arxiv.org/pdf/2310.02437v1.pdf","comment":"17 pages, 20 figures, 2 tables"},{"id":"http://arxiv.org/abs/2302.05624v2","updated":"2023-10-03T21:01:30Z","published":"2023-02-11T08:20:17Z","title":"A novel approach to generate datasets with XAI ground truth to evaluate\n image models","summary":" With the increased usage of artificial intelligence (AI), it is imperative to\nunderstand how these models work internally. These needs have led to the\ndevelopment of a new field called eXplainable artificial intelligence (XAI).\nThis field consists of on a set of techniques that allows us to theoretically\ndetermine the cause of the AI decisions. 
One main issue of XAI is how to verify\nthe works on this field, taking into consideration the lack of ground truth\n(GT). In this study, we propose a new method to generate datasets with GT. We\nconducted a set of experiments that compared our GT with real model\nexplanations and obtained excellent results confirming that our proposed method\nis correct.\n","authors":["Miquel Miró-Nicolau","Antoni Jaume-i-Capó","Gabriel Moyà-Alcover"],"pdf_url":"https://arxiv.org/pdf/2302.05624v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02426v1","updated":"2023-10-03T20:46:10Z","published":"2023-10-03T20:46:10Z","title":"EditVal: Benchmarking Diffusion Based Text-Guided Image Editing Methods","summary":" A plethora of text-guided image editing methods have recently been developed\nby leveraging the impressive capabilities of large-scale diffusion-based\ngenerative models such as Imagen and Stable Diffusion. A standardized\nevaluation protocol, however, does not exist to compare methods across\ndifferent types of fine-grained edits. To address this gap, we introduce\nEditVal, a standardized benchmark for quantitatively evaluating text-guided\nimage editing methods. EditVal consists of a curated dataset of images, a set\nof editable attributes for each image drawn from 13 possible edit types, and an\nautomated evaluation pipeline that uses pre-trained vision-language models to\nassess the fidelity of generated images for each edit type. We use EditVal to\nbenchmark 8 cutting-edge diffusion-based editing methods including SINE, Imagic\nand Instruct-Pix2Pix. We complement this with a large-scale human study where\nwe show that EditVall's automated evaluation pipeline is strongly correlated\nwith human-preferences for the edit types we considered. From both the human\nstudy and automated evaluation, we find that: (i) Instruct-Pix2Pix, Null-Text\nand SINE are the top-performing methods averaged across different edit types,\nhowever {\\it only} Instruct-Pix2Pix and Null-Text are able to preserve original\nimage properties; (ii) Most of the editing methods fail at edits involving\nspatial operations (e.g., changing the position of an object). (iii) There is\nno `winner' method which ranks the best individually across a range of\ndifferent edit types. We hope that our benchmark can pave the way to developing\nmore reliable text-guided image editing tools in the future. We will publicly\nrelease EditVal, and all associated code and human-study templates to support\nthese research directions in https://deep-ml-research.github.io/editval/.\n","authors":["Samyadeep Basu","Mehrdad Saberi","Shweta Bhardwaj","Atoosa Malemir Chegini","Daniela Massiceti","Maziar Sanjabi","Shell Xu Hu","Soheil Feizi"],"pdf_url":"https://arxiv.org/pdf/2310.02426v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.03364v2","updated":"2023-10-03T20:43:50Z","published":"2023-06-06T02:38:01Z","title":"Learning Representations on the Unit Sphere: Investigating Angular\n Gaussian and von Mises-Fisher Distributions for Online Continual Learning","summary":" We use the maximum a posteriori estimation principle for learning\nrepresentations distributed on the unit sphere. We propose to use the angular\nGaussian distribution, which corresponds to a Gaussian projected on the\nunit-sphere and derive the associated loss function. We also consider the von\nMises-Fisher distribution, which is the conditional of a Gaussian in the\nunit-sphere. 
The learned representations are pushed toward fixed directions,\nwhich are the prior means of the Gaussians; allowing for a learning strategy\nthat is resilient to data drift. This makes it suitable for online continual\nlearning, which is the problem of training neural networks on a continuous data\nstream, where multiple classification tasks are presented sequentially so that\ndata from past tasks are no longer accessible, and data from the current task\ncan be seen only once. To address this challenging scenario, we propose a\nmemory-based representation learning technique equipped with our new loss\nfunctions. Our approach does not require negative data or knowledge of task\nboundaries and performs well with smaller batch sizes while being\ncomputationally efficient. We demonstrate with extensive experiments that the\nproposed method outperforms the current state-of-the-art methods on both\nstandard evaluation scenarios and realistic scenarios with blurry task\nboundaries. For reproducibility, we use the same training pipeline for every\ncompared method and share the code at https://t.ly/SQTj.\n","authors":["Nicolas Michel","Giovanni Chierchia","Romain Negrel","Jean-François Bercher"],"pdf_url":"https://arxiv.org/pdf/2306.03364v2.pdf","comment":"17 pages, under review"},{"id":"http://arxiv.org/abs/2310.02420v1","updated":"2023-10-03T20:34:01Z","published":"2023-10-03T20:34:01Z","title":"FedL2P: Federated Learning to Personalize","summary":" Federated learning (FL) research has made progress in developing algorithms\nfor distributed learning of global models, as well as algorithms for local\npersonalization of those common models to the specifics of each client's local\ndata distribution. However, different FL problems may require different\npersonalization strategies, and it may not even be possible to define an\neffective one-size-fits-all personalization strategy for all clients: depending\non how similar each client's optimal predictor is to that of the global model,\ndifferent personalization strategies may be preferred. In this paper, we\nconsider the federated meta-learning problem of learning personalization\nstrategies. Specifically, we consider meta-nets that induce the batch-norm and\nlearning rate parameters for each client given local data statistics. By\nlearning these meta-nets through FL, we allow the whole FL network to\ncollaborate in learning a customized personalization strategy for each client.\nEmpirical results show that this framework improves on a range of standard\nhand-crafted personalization baselines in both label and feature shift\nsituations.\n","authors":["Royson Lee","Minyoung Kim","Da Li","Xinchi Qiu","Timothy Hospedales","Ferenc Huszár","Nicholas D. Lane"],"pdf_url":"https://arxiv.org/pdf/2310.02420v1.pdf","comment":"Accepted at the 37th Conference on Neural Information Processing\n Systems (NeurIPS 2023)"},{"id":"http://arxiv.org/abs/2310.02416v1","updated":"2023-10-03T20:28:09Z","published":"2023-10-03T20:28:09Z","title":"Bag of Tricks for Fully Test-Time Adaptation","summary":" Fully Test-Time Adaptation (TTA), which aims at adapting models to data\ndrifts, has recently attracted wide interest. Numerous tricks and techniques\nhave been proposed to ensure robust learning on arbitrary streams of unlabeled\ndata. However, assessing the true impact of each individual technique and\nobtaining a fair comparison still constitutes a significant challenge. 
To help\nconsolidate the community's knowledge, we present a categorization of selected\northogonal TTA techniques, including small batch normalization, stream\nrebalancing, reliable sample selection, and network confidence calibration. We\nmeticulously dissect the effect of each approach on different scenarios of\ninterest. Through our analysis, we shed light on trade-offs induced by those\ntechniques between accuracy, the computational power required, and model\ncomplexity. We also uncover the synergy that arises when combining techniques\nand are able to establish new state-of-the-art results.\n","authors":["Saypraseuth Mounsaveng","Florent Chiaroni","Malik Boudiaf","Marco Pedersoli","Ismail Ben Ayed"],"pdf_url":"https://arxiv.org/pdf/2310.02416v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.02299v3","updated":"2023-10-03T20:10:07Z","published":"2023-05-03T17:48:55Z","title":"Dynamic Sparse Training with Structured Sparsity","summary":" Dynamic Sparse Training (DST) methods achieve state-of-the-art results in\nsparse neural network training, matching the generalization of dense models\nwhile enabling sparse training and inference. Although the resulting models are\nhighly sparse and theoretically less computationally expensive, achieving\nspeedups with unstructured sparsity on real-world hardware is challenging. In\nthis work, we propose a sparse-to-sparse DST method, Structured RigL (SRigL),\nto learn a variant of fine-grained structured N:M sparsity by imposing a\nconstant fan-in constraint. Using our empirical analysis of existing DST\nmethods at high sparsity, we additionally employ a neuron ablation method which\nenables SRigL to achieve state-of-the-art sparse-to-sparse structured DST\nperformance on a variety of Neural Network (NN) architectures. We demonstrate\nreduced real-world timings on CPU for online inference -- 3.6x/2x faster at 90%\nsparsity than equivalent dense/unstructured sparse layers, respectively. Our\nsource code is available at https://github.com/calgaryml/condensed-sparsity\n","authors":["Mike Lasby","Anna Golubeva","Utku Evci","Mihai Nica","Yani Ioannou"],"pdf_url":"https://arxiv.org/pdf/2305.02299v3.pdf","comment":"24 pages, 14 figures"},{"id":"http://arxiv.org/abs/2310.02401v1","updated":"2023-10-03T19:50:08Z","published":"2023-10-03T19:50:08Z","title":"FT-Shield: A Watermark Against Unauthorized Fine-tuning in Text-to-Image\n Diffusion Models","summary":" Text-to-image generative models based on latent diffusion models (LDM) have\ndemonstrated their outstanding ability in generating high-quality and\nhigh-resolution images according to language prompt. Based on these powerful\nlatent diffusion models, various fine-tuning methods have been proposed to\nachieve the personalization of text-to-image diffusion models such as artistic\nstyle adaptation and human face transfer. However, the unauthorized usage of\ndata for model personalization has emerged as a prevalent concern in relation\nto copyright violations. For example, a malicious user may use the fine-tuning\ntechnique to generate images which mimic the style of a painter without his/her\npermission. 
In light of this concern, we have proposed FT-Shield, a\nwatermarking approach specifically designed for the fine-tuning of\ntext-to-image diffusion models to aid in detecting instances of infringement.\nWe develop a novel algorithm for the generation of the watermark to ensure that\nthe watermark on the training images can be quickly and accurately transferred\nto the generated images of text-to-image diffusion models. A watermark will be\ndetected on an image by a binary watermark detector if the image is generated\nby a model that has been fine-tuned using the protected watermarked images.\nComprehensive experiments were conducted to validate the effectiveness of\nFT-Shield.\n","authors":["Yingqian Cui","Jie Ren","Yuping Lin","Han Xu","Pengfei He","Yue Xing","Wenqi Fan","Hui Liu","Jiliang Tang"],"pdf_url":"https://arxiv.org/pdf/2310.02401v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02386v1","updated":"2023-10-03T19:13:43Z","published":"2023-10-03T19:13:43Z","title":"ScaleNet: An Unsupervised Representation Learning Method for Limited\n Information","summary":" Although large-scale labeled data are essential for deep convolutional neural\nnetworks (ConvNets) to learn high-level semantic visual representations, it is\ntime-consuming and impractical to collect and annotate large-scale datasets. A\nsimple and efficient unsupervised representation learning method named ScaleNet\nbased on multi-scale images is proposed in this study to enhance the\nperformance of ConvNets when limited information is available. The input images\nare first resized to a smaller size and fed to the ConvNet to recognize the\nrotation degree. Next, the ConvNet learns the rotation-prediction task for the\noriginal size images based on the parameters transferred from the previous\nmodel. The CIFAR-10 and ImageNet datasets are examined on different\narchitectures such as AlexNet and ResNet50 in this study. The current study\ndemonstrates that specific image features, such as Harris corner information,\nplay a critical role in the efficiency of the rotation-prediction task. The\nScaleNet supersedes the RotNet by ~7% in the limited CIFAR-10 dataset. The\ntransferred parameters from a ScaleNet model with limited data improve the\nImageNet Classification task by about 6% compared to the RotNet model. This\nstudy shows the capability of the ScaleNet method to improve other cutting-edge\nmodels such as SimCLR by learning effective features for classification tasks.\n","authors":["Huili Huang","M. Mahdi Roozbahani"],"pdf_url":"https://arxiv.org/pdf/2310.02386v1.pdf","comment":"Accepted by DAGM GCPR 2021"},{"id":"http://arxiv.org/abs/2310.02381v1","updated":"2023-10-03T19:05:00Z","published":"2023-10-03T19:05:00Z","title":"Multi-Prompt Fine-Tuning of Foundation Models for Enhanced Medical Image\n Segmentation","summary":" The Segment Anything Model (SAM) is a powerful foundation model that\nintroduced revolutionary advancements in natural image segmentation. However,\nits performance remains sub-optimal when delineating the intricate structure of\nbiomedical images, where multiple organs and tissues intertwine in a single\nimage. In this study, we introduce a novel fine-tuning framework that leverages\nSAM's ability to bundle and process multiple prompts per image and seeks to\nimprove SAM's performance in medical images. We first curated a medical image\ndataset that consists of CT scans of lesions in various organs, each with two\nannotations for organs and lesions respectively. 
Then, we fine-tuned SAM's mask\ndecoder within our framework by batching both bounding boxes generated from\nground truth masks as reference. The batched prompt strategy we introduced not\nonly addresses the inherent complexity and ambiguity often found in medical\nimages but also substantially enhances performance metrics when applied onto a\nwide range of segmentation tasks.\n","authors":["Xiangru Li","Yifei Zhang","Liang Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.02381v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.11203v2","updated":"2023-10-03T18:32:44Z","published":"2023-05-18T16:57:10Z","title":"PDP: Parameter-free Differentiable Pruning is All You Need","summary":" DNN pruning is a popular way to reduce the size of a model, improve the\ninference latency, and minimize the power consumption on DNN accelerators.\nHowever, existing approaches might be too complex, expensive or ineffective to\napply to a variety of vision/language tasks, DNN architectures and to honor\nstructured pruning constraints. In this paper, we propose an efficient yet\neffective train-time pruning scheme, Parameter-free Differentiable Pruning\n(PDP), which offers state-of-the-art qualities in model size, accuracy, and\ntraining cost. PDP uses a dynamic function of weights during training to\ngenerate soft pruning masks for the weights in a parameter-free manner for a\ngiven pruning target. While differentiable, the simplicity and efficiency of\nPDP make it universal enough to deliver state-of-the-art\nrandom/structured/channel pruning results on various vision and natural\nlanguage tasks. For example, for MobileNet-v1, PDP can achieve 68.2% top-1\nImageNet1k accuracy at 86.6% sparsity, which is 1.7% higher accuracy than those\nfrom the state-of-the-art algorithms. Also, PDP yields over 83.1% accuracy on\nMulti-Genre Natural Language Inference with 90% sparsity for BERT, while the\nnext best from the existing techniques shows 81.5% accuracy. In addition, PDP\ncan be applied to structured pruning, such as N:M pruning and channel pruning.\nFor 1:4 structured pruning of ResNet18, PDP improved the top-1 ImageNet1k\naccuracy by over 3.6% over the state-of-the-art. For channel pruning of\nResNet50, PDP reduced the top-1 ImageNet1k accuracy by 0.6% from the\nstate-of-the-art.\n","authors":["Minsik Cho","Saurabh Adya","Devang Naik"],"pdf_url":"https://arxiv.org/pdf/2305.11203v2.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2310.02143v1","updated":"2023-10-03T15:29:37Z","published":"2023-10-03T15:29:37Z","title":"CORec-Cri: How collaborative and social technologies can help to\n contextualize crises?","summary":" Crisis situations can present complex and multifaceted challenges, often\nrequiring the involvement of multiple organizations and stakeholders with\nvarying areas of expertise, responsibilities, and resources. Acquiring accurate\nand timely information about impacted areas is crucial to effectively respond\nto these crises. In this paper, we investigate how collaborative and social\ntechnologies help to contextualize crises, including identifying impacted areas\nand real-time needs. To this end, we define CORec-Cri (Contextulized\nOntology-based Recommender system for crisis management) based on existing\nwork. Our motivation for this approach is two-fold: first, effective\ncollaboration among stakeholders is essential for efficient and coordinated\ncrisis response; second, social computing facilitates interaction, information\nflow, and collaboration among stakeholders. 
We detail the key components of our\nsystem design, highlighting its potential to support decision-making, resource\nallocation, and communication among stakeholders. Finally, we provide examples\nof how our system can be applied to contextualize crises to improve crisis\nmanagement.\n","authors":["Ngoc Luyen Le","Jinfeng Zhong","Elsa Negre","Marie-Hélène Abel"],"pdf_url":"https://arxiv.org/pdf/2310.02143v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2109.03459v3","updated":"2023-10-03T14:55:06Z","published":"2021-09-08T07:00:45Z","title":"Dual Correction Strategy for Ranking Distillation in Top-N Recommender\n System","summary":" Knowledge Distillation (KD), which transfers the knowledge of a well-trained\nlarge model (teacher) to a small model (student), has become an important area\nof research for practical deployment of recommender systems. Recently, Relaxed\nRanking Distillation (RRD) has shown that distilling the ranking information in\nthe recommendation list significantly improves the performance. However, the\nmethod still has limitations in that 1) it does not fully utilize the\nprediction errors of the student model, which makes the training not fully\nefficient, and 2) it only distills the user-side ranking information, which\nprovides an insufficient view under the sparse implicit feedback. This paper\npresents Dual Correction strategy for Distillation (DCD), which transfers the\nranking information from the teacher model to the student model in a more\nefficient manner. Most importantly, DCD uses the discrepancy between the\nteacher model and the student model predictions to decide which knowledge to be\ndistilled. By doing so, DCD essentially provides the learning guidance tailored\nto \"correcting\" what the student model has failed to accurately predict. This\nprocess is applied for transferring the ranking information from the user-side\nas well as the item-side to address sparse implicit user feedback. Our\nexperiments show that the proposed method outperforms the state-of-the-art\nbaselines, and ablation studies validate the effectiveness of each component.\n","authors":["Youngjune Lee","Kee-Eung Kim"],"pdf_url":"https://arxiv.org/pdf/2109.03459v3.pdf","comment":"CIKM 2021"},{"id":"http://arxiv.org/abs/2308.16708v2","updated":"2023-10-03T14:14:43Z","published":"2023-08-31T13:24:57Z","title":"Concentrating on the Impact: Consequence-based Explanations in\n Recommender Systems","summary":" Recommender systems assist users in decision-making, where the presentation\nof recommended items and their explanations are critical factors for enhancing\nthe overall user experience. Although various methods for generating\nexplanations have been proposed, there is still room for improvement,\nparticularly for users who lack expertise in a specific item domain. In this\nstudy, we introduce the novel concept of \\textit{consequence-based\nexplanations}, a type of explanation that emphasizes the individual impact of\nconsuming a recommended item on the user, which makes the effect of following\nrecommendations clearer. We conducted an online user study to examine our\nassumption about the appreciation of consequence-based explanations and their\nimpacts on different explanation aims in recommender systems. Our findings\nhighlight the importance of consequence-based explanations, which were\nwell-received by users and effectively improved user satisfaction in\nrecommender systems. 
These results provide valuable insights for designing\nengaging explanations that can enhance the overall user experience in\ndecision-making.\n","authors":["Sebastian Lubos","Thi Ngoc Trang Tran","Seda Polat Erdeniz","Merfat El Mansi","Alexander Felfernig","Manfred Wundara","Gerhard Leitner"],"pdf_url":"https://arxiv.org/pdf/2308.16708v2.pdf","comment":"Preprint of the paper to be presented at IntRS'23: Joint Workshop on\n Interfaces and Human Decision Making for Recommender Systems, September 18,\n 2023, Singapore. paper will be published in the workshop proceedings"},{"id":"http://arxiv.org/abs/2310.01978v1","updated":"2023-10-03T11:42:29Z","published":"2023-10-03T11:42:29Z","title":"Online Multimedia Verification with Computational Tools and OSINT:\n Russia-Ukraine Conflict Case Studies","summary":" This paper investigates the use of computational tools and Open-Source\nIntelligence (OSINT) techniques for verifying online multimedia content, with a\nspecific focus on real-world cases from the Russia-Ukraine conflict. Over a\nnine-month period from April to December 2022, we examine verification\nworkflows, tools, and case studies published by \\faktiskbar. Our study\nshowcases the effectiveness of diverse resources, including AI tools,\ngeolocation tools, internet archives, and social media monitoring platforms, in\nenabling journalists and fact-checkers to efficiently process and corroborate\nevidence, ensuring the dissemination of accurate information. This research\nunderscores the vital role of computational tools and OSINT techniques in\npromoting evidence-based reporting and combatting misinformation. We also touch\non the current limitations of available tools and prospects for future\ndevelopments in multimedia verification.\n","authors":["Sohail Ahmed Khan","Jan Gunnar Furuly","Henrik Brattli Vold","Rano Tahseen","Duc-Tien Dang-Nguyen"],"pdf_url":"https://arxiv.org/pdf/2310.01978v1.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2208.08723v2","updated":"2023-10-03T05:21:38Z","published":"2022-08-18T09:15:18Z","title":"Disentangled Contrastive Learning for Social Recommendation","summary":" Social recommendations utilize social relations to enhance the representation\nlearning for recommendations. Most social recommendation models unify user\nrepresentations for the user-item interactions (collaborative domain) and\nsocial relations (social domain). However, such an approach may fail to model\nthe users heterogeneous behavior patterns in two domains, impairing the\nexpressiveness of user representations. In this work, to address such\nlimitation, we propose a novel Disentangled contrastive learning framework for\nsocial Recommendations DcRec. More specifically, we propose to learn\ndisentangled users representations from the item and social domains. Moreover,\ndisentangled contrastive learning is designed to perform knowledge transfer\nbetween disentangled users representations for social recommendations.\nComprehensive experiments on various real-world datasets demonstrate the\nsuperiority of our proposed model.\n","authors":["Jiahao Wu","Wenqi Fan","Jingfan Chen","Shengcai Liu","Qing Li","Ke Tang"],"pdf_url":"https://arxiv.org/pdf/2208.08723v2.pdf","comment":"CIKM2022"},{"id":"http://arxiv.org/abs/2310.02367v1","updated":"2023-10-03T18:44:17Z","published":"2023-10-03T18:44:17Z","title":"Linear Recurrent Units for Sequential Recommendation","summary":" State-of-the-art sequential recommendation relies heavily on\nself-attention-based recommender models. 
Yet such models are computationally\nexpensive and often too slow for real-time recommendation. Furthermore, the\nself-attention operation is performed at a sequence-level, thereby making\nlow-cost incremental inference challenging. Inspired by recent advances in\nefficient language modeling, we propose linear recurrent units for sequential\nrecommendation (LRURec). Similar to recurrent neural networks, LRURec offers\nrapid inference and can achieve incremental inference on sequential inputs. By\ndecomposing the linear recurrence operation and designing recursive\nparallelization in our framework, LRURec provides the additional benefits of\nreduced model size and parallelizable training. Moreover, we optimize the\narchitecture of LRURec by implementing a series of modifications to address the\nlack of non-linearity and improve training dynamics. To validate the\neffectiveness of our proposed LRURec, we conduct extensive experiments on\nmultiple real-world datasets and compare its performance against\nstate-of-the-art sequential recommenders. Experimental results demonstrate the\neffectiveness of LRURec, which consistently outperforms baselines by a\nsignificant margin. Results also highlight the efficiency of LRURec with our\nparallelized training paradigm and fast inference on long sequences, showing\nits potential to further enhance user experience in sequential recommendation.\n","authors":["Zhenrui Yue","Yueqi Wang","Zhankui He","Huimin Zeng","Julian McAuley","Dong Wang"],"pdf_url":"https://arxiv.org/pdf/2310.02367v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02294v1","updated":"2023-10-03T09:25:01Z","published":"2023-10-03T09:25:01Z","title":"Beyond-Accuracy: A Review on Diversity, Serendipity and Fairness in\n Recommender Systems Based on Graph Neural Networks","summary":" By providing personalized suggestions to users, recommender systems have\nbecome essential to numerous online platforms. Collaborative filtering,\nparticularly graph-based approaches using Graph Neural Networks (GNNs), have\ndemonstrated great results in terms of recommendation accuracy. However,\naccuracy may not always be the most important criterion for evaluating\nrecommender systems' performance, since beyond-accuracy aspects such as\nrecommendation diversity, serendipity, and fairness can strongly influence user\nengagement and satisfaction. This review paper focuses on addressing these\ndimensions in GNN-based recommender systems, going beyond the conventional\naccuracy-centric perspective. We begin by reviewing recent developments in\napproaches that improve not only the accuracy-diversity trade-off but also\npromote serendipity and fairness in GNN-based recommender systems. We discuss\ndifferent stages of model development including data preprocessing, graph\nconstruction, embedding initialization, propagation layers, embedding fusion,\nscore computation, and training methodologies. Furthermore, we present a look\ninto the practical difficulties encountered in assuring diversity, serendipity,\nand fairness, while retaining high accuracy. Finally, we discuss potential\nfuture research directions for developing more robust GNN-based recommender\nsystems that go beyond the unidimensional perspective of focusing solely on\naccuracy. 
This review aims to provide researchers and practitioners with an\nin-depth understanding of the multifaceted issues that arise when designing\nGNN-based recommender systems, setting our work apart by offering a\ncomprehensive exploration of beyond-accuracy dimensions.\n","authors":["Tomislav Duricic","Dominik Kowald","Emanuel Lacic","Elisabeth Lex"],"pdf_url":"https://arxiv.org/pdf/2310.02294v1.pdf","comment":"14 pages, 1 figure, 1 table"},{"id":"http://arxiv.org/abs/2310.03043v1","updated":"2023-10-03T18:45:21Z","published":"2023-10-03T18:45:21Z","title":"A Deep Reinforcement Learning Approach for Interactive Search with\n Sentence-level Feedback","summary":" Interactive search can provide a better experience by incorporating\ninteraction feedback from the users. This can significantly improve search\naccuracy as it helps avoid irrelevant information and captures the users'\nsearch intents. Existing state-of-the-art (SOTA) systems use reinforcement\nlearning (RL) models to incorporate the interactions but focus on item-level\nfeedback, ignoring the fine-grained information found in sentence-level\nfeedback. Yet such feedback requires extensive RL action space exploration and\nlarge amounts of annotated data. This work addresses these challenges by\nproposing a new deep Q-learning (DQ) approach, DQrank. DQrank adapts BERT-based\nmodels, the SOTA in natural language processing, to select crucial sentences\nbased on users' engagement and rank the items to obtain more satisfactory\nresponses. We also propose two mechanisms to better explore optimal actions.\nDQrank further utilizes the experience replay mechanism in DQ to store the\nfeedback sentences to obtain a better initial ranking performance. We validate\nthe effectiveness of DQrank on three search datasets. The results show that\nDQRank performs at least 12% better than the previous SOTA RL approaches. We\nalso conduct detailed ablation studies. The ablation results demonstrate that\neach model component can efficiently extract and accumulate long-term\nengagement effects from the users' sentence-level feedback. This structure\noffers new technologies with promised performance to construct a search system\nwith sentence-level interaction.\n","authors":["Jianghong Zhou","Joyce C. Ho","Chen Lin","Eugene Agichtein"],"pdf_url":"https://arxiv.org/pdf/2310.03043v1.pdf","comment":"9 pages, 7 figures, DRL4IR@CIKM"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2310.02265v1","updated":"2023-10-03T17:59:58Z","published":"2023-10-03T17:59:58Z","title":"DREAM: Visual Decoding from Reversing Human Visual System","summary":" In this work we present DREAM, an fMRI-to-image method for reconstructing\nviewed images from brain activities, grounded on fundamental knowledge of the\nhuman visual system. We craft reverse pathways that emulate the hierarchical\nand parallel nature of how humans perceive the visual world. These tailored\npathways are specialized to decipher semantics, color, and depth cues from fMRI\ndata, mirroring the forward pathways from visual stimuli to fMRI recordings. To\ndo so, two components mimic the inverse processes within the human visual\nsystem: the Reverse Visual Association Cortex (R-VAC) which reverses pathways\nof this brain region, extracting semantics from fMRI data; the Reverse Parallel\nPKM (R-PKM) component simultaneously predicting color and depth from fMRI\nsignals. 
The experiments indicate that our method outperforms the current\nstate-of-the-art models in terms of the consistency of appearance, structure,\nand semantics. Code will be made publicly available to facilitate further\nresearch in this field.\n","authors":["Weihao Xia","Raoul de Charette","Cengiz Öztireli","Jing-Hao Xue"],"pdf_url":"https://arxiv.org/pdf/2310.02265v1.pdf","comment":"Project Page: https://weihaox.github.io/DREAM"},{"id":"http://arxiv.org/abs/2310.02263v1","updated":"2023-10-03T17:59:46Z","published":"2023-10-03T17:59:46Z","title":"Contrastive Post-training Large Language Models on Data Curriculum","summary":" Alignment serves as an important step to steer large language models (LLMs)\ntowards human preferences. In this paper, we explore contrastive post-training\ntechniques for alignment by automatically constructing preference pairs from\nmultiple models of varying strengths (e.g., InstructGPT, ChatGPT and GPT-4). We\ncarefully compare the contrastive techniques of SLiC and DPO to SFT baselines\nand find that DPO provides a step-function improvement even after continueing\nSFT saturates. We also explore a data curriculum learning scheme for\ncontrastive post-training, which starts by learning from \"easier\" pairs and\ntransitioning to \"harder\" ones, which further improves alignment. Finally, we\nscale up our experiments to train with more data and larger models like Orca.\nRemarkably, contrastive post-training further improves the performance of Orca,\nalready a state-of-the-art instruction learning model tuned with GPT-4 outputs,\nto exceed that of ChatGPT.\n","authors":["Canwen Xu","Corby Rosset","Luciano Del Corro","Shweti Mahajan","Julian McAuley","Jennifer Neville","Ahmed Hassan Awadallah","Nikhil Rao"],"pdf_url":"https://arxiv.org/pdf/2310.02263v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02264v1","updated":"2023-10-03T17:59:46Z","published":"2023-10-03T17:59:46Z","title":"Generalizable Long-Horizon Manipulations with Large Language Models","summary":" This work introduces a framework harnessing the capabilities of Large\nLanguage Models (LLMs) to generate primitive task conditions for generalizable\nlong-horizon manipulations with novel objects and unseen tasks. These task\nconditions serve as guides for the generation and adjustment of Dynamic\nMovement Primitives (DMP) trajectories for long-horizon task execution. We\nfurther create a challenging robotic manipulation task suite based on Pybullet\nfor long-horizon task evaluation. Extensive experiments in both simulated and\nreal-world environments demonstrate the effectiveness of our framework on both\nfamiliar tasks involving new objects and novel but related tasks, highlighting\nthe potential of LLMs in enhancing robotic system versatility and adaptability.\nProject website: https://object814.github.io/Task-Condition-With-LLM/\n","authors":["Haoyu Zhou","Mingyu Ding","Weikun Peng","Masayoshi Tomizuka","Lin Shao","Chuang Gan"],"pdf_url":"https://arxiv.org/pdf/2310.02264v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02258v1","updated":"2023-10-03T17:58:33Z","published":"2023-10-03T17:58:33Z","title":"A Neural Scaling Law from Lottery Ticket Ensembling","summary":" Neural scaling laws (NSL) refer to the phenomenon where model performance\nimproves with scale. 
Sharma & Kaplan analyzed NSL using approximation theory\nand predict that MSE losses decay as $N^{-\\alpha}$, $\\alpha=4/d$, where $N$ is\nthe number of model parameters, and $d$ is the intrinsic input dimension.\nAlthough their theory works well for some cases (e.g., ReLU networks), we\nsurprisingly find that a simple 1D problem $y=x^2$ manifests a different\nscaling law ($\\alpha=1$) from their predictions ($\\alpha=4$). We opened the\nneural networks and found that the new scaling law originates from lottery\nticket ensembling: a wider network on average has more \"lottery tickets\", which\nare ensembled to reduce the variance of outputs. We support the ensembling\nmechanism by mechanistically interpreting single neural networks, as well as\nstudying them statistically. We attribute the $N^{-1}$ scaling law to the\n\"central limit theorem\" of lottery tickets. Finally, we discuss its potential\nimplications for large language models and statistical physics-type theories of\nlearning.\n","authors":["Ziming Liu","Max Tegmark"],"pdf_url":"https://arxiv.org/pdf/2310.02258v1.pdf","comment":"14 pages, 13 figures"},{"id":"http://arxiv.org/abs/2310.02255v1","updated":"2023-10-03T17:57:24Z","published":"2023-10-03T17:57:24Z","title":"MathVista: Evaluating Mathematical Reasoning of Foundation Models in\n Visual Contexts","summary":" Although Large Language Models (LLMs) and Large Multimodal Models (LMMs)\nexhibit impressive skills in various domains, their ability for mathematical\nreasoning within visual contexts has not been formally examined. Equipping LLMs\nand LMMs with this capability is vital for general-purpose AI assistants and\nshowcases promising potential in education, data analysis, and scientific\ndiscovery. To bridge this gap, we present MathVista, a benchmark designed to\namalgamate challenges from diverse mathematical and visual tasks. We first\ntaxonomize the key task types, reasoning skills, and visual contexts from the\nliterature to guide our selection from 28 existing math-focused and visual\nquestion answering datasets. Then, we construct three new datasets, IQTest,\nFunctionQA, and PaperQA, to accommodate for missing types of visual contexts.\nThe problems featured often require deep visual understanding beyond OCR or\nimage captioning, and compositional reasoning with rich domain-specific tools,\nthus posing a notable challenge to existing models. We conduct a comprehensive\nevaluation of 11 prominent open-source and proprietary foundation models (LLMs,\nLLMs augmented with tools, and LMMs), and early experiments with GPT-4V. The\nbest-performing model, Multimodal Bard, achieves only 58% of human performance\n(34.8% vs 60.3%), indicating ample room for further improvement. Given this\nsignificant gap, MathVista fuels future research in the development of\ngeneral-purpose AI agents capable of tackling mathematically intensive and\nvisually rich real-world tasks. Preliminary tests show that MathVista also\npresents challenges to GPT-4V, underscoring the benchmark's importance. The\nproject is available at https://mathvista.github.io/.\n","authors":["Pan Lu","Hritik Bansal","Tony Xia","Jiacheng Liu","Chunyuan Li","Hannaneh Hajishirzi","Hao Cheng","Kai-Wei Chang","Michel Galley","Jianfeng Gao"],"pdf_url":"https://arxiv.org/pdf/2310.02255v1.pdf","comment":"51 pages, 56 figures. 
Work in progress"},{"id":"http://arxiv.org/abs/2310.02254v1","updated":"2023-10-03T17:56:07Z","published":"2023-10-03T17:56:07Z","title":"Learning unitaries with quantum statistical queries","summary":" We propose several algorithms for learning unitary operators from quantum\nstatistical queries (QSQs) with respect to their Choi-Jamiolkowski state.\nQuantum statistical queries capture the capabilities of a learner with limited\nquantum resources, which receives as input only noisy estimates of expected\nvalues of measurements. Our methods hinge on a novel technique for estimating\nthe Fourier mass of a unitary on a subset of Pauli strings with a single\nquantum statistical query, generalizing a previous result for uniform quantum\nexamples. Exploiting this insight, we show that the quantum Goldreich-Levin\nalgorithm can be implemented with quantum statistical queries, whereas the\nprior version of the algorithm involves oracle access to the unitary and its\ninverse. Moreover, we prove that $\\mathcal{O}(\\log n)$-juntas and quantum\nBoolean functions with constant total influence are efficiently learnable in\nour model, and constant-depth circuits are learnable sample-efficiently with\nquantum statistical queries. On the other hand, all previous algorithms for\nthese tasks require direct access to the Choi-Jamiolkowski state or oracle\naccess to the unitary. In addition, our upper bounds imply that the actions of\nthose classes of unitaries on locally scrambled ensembles can be efficiently\nlearned. We also demonstrate that, despite these positive results, quantum\nstatistical queries lead to an exponentially larger sample complexity for\ncertain tasks, compared to separable measurements to the Choi-Jamiolkowski\nstate. In particular, we show an exponential lower bound for learning a class\nof phase-oracle unitaries and a double exponential lower bound for testing the\nunitarity of channels, adapting to our setting previous arguments for quantum\nstates. Finally, we propose a new definition of average-case surrogate models,\nshowing a potential application of our results to hybrid quantum machine\nlearning.\n","authors":["Armando Angrisani"],"pdf_url":"https://arxiv.org/pdf/2310.02254v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02250v1","updated":"2023-10-03T17:53:43Z","published":"2023-10-03T17:53:43Z","title":"Why do autoencoders work?","summary":" Deep neural network autoencoders are routinely used computationally for model\nreduction. They allow recognizing the intrinsic dimension of data that lie in a\n$k$-dimensional subset $K$ of an input Euclidean space $\\R^n$. The underlying\nidea is to obtain both an encoding layer that maps $\\R^n$ into $\\R^k$ (called\nthe bottleneck layer or the space of latent variables) and a decoding layer\nthat maps $\\R^k$ back into $\\R^n$, in such a way that the input data from the\nset $K$ is recovered when composing the two maps. This is achieved by adjusting\nparameters (weights) in the network to minimize the discrepancy between the\ninput and the reconstructed output. Since neural networks (with continuous\nactivation functions) compute continuous maps, the existence of a network that\nachieves perfect reconstruction would imply that $K$ is homeomorphic to a\n$k$-dimensional subset of $\\R^k$, so clearly there are topological obstructions\nto finding such a network. On the other hand, in practice the technique is\nfound to ``work'' well, which leads one to ask if there is a way to explain\nthis effectiveness. 
We show that, up to small errors, indeed the method is\nguaranteed to work. This is done by appealing to certain facts from\ndifferential geometry. A computational example is also included to illustrate\nthe ideas.\n","authors":["Matthew D. Kvalheim","Eduardo D. Sontag"],"pdf_url":"https://arxiv.org/pdf/2310.02250v1.pdf","comment":"12 pages, 8 figures"},{"id":"http://arxiv.org/abs/2310.02249v1","updated":"2023-10-03T17:53:09Z","published":"2023-10-03T17:53:09Z","title":"Harnessing Pre-Trained Sentence Transformers for Offensive Language\n Detection in Indian Languages","summary":" In our increasingly interconnected digital world, social media platforms have\nemerged as powerful channels for the dissemination of hate speech and offensive\ncontent. This work delves into the domain of hate speech detection, placing\nspecific emphasis on three low-resource Indian languages: Bengali, Assamese,\nand Gujarati. The challenge is framed as a text classification task, aimed at\ndiscerning whether a tweet contains offensive or non-offensive content.\nLeveraging the HASOC 2023 datasets, we fine-tuned pre-trained BERT and SBERT\nmodels to evaluate their effectiveness in identifying hate speech. Our findings\nunderscore the superiority of monolingual sentence-BERT models, particularly in\nthe Bengali language, where we achieved the highest ranking. However, the\nperformance in Assamese and Gujarati languages signifies ongoing opportunities\nfor enhancement. Our goal is to foster inclusive online spaces by countering\nhate speech proliferation.\n","authors":["Ananya Joshi","Raviraj Joshi"],"pdf_url":"https://arxiv.org/pdf/2310.02249v1.pdf","comment":"HASOC at FIRE 2023"},{"id":"http://arxiv.org/abs/2310.02246v1","updated":"2023-10-03T17:51:42Z","published":"2023-10-03T17:51:42Z","title":"Learning to Relax: Setting Solver Parameters Across a Sequence of Linear\n System Instances","summary":" Solving a linear system $Ax=b$ is a fundamental scientific computing\nprimitive for which numerous solvers and preconditioners have been developed.\nThese come with parameters whose optimal values depend on the system being\nsolved and are often impossible or too expensive to identify; thus in practice\nsub-optimal heuristics are used. We consider the common setting in which many\nrelated linear systems need to be solved, e.g. during a single numerical\nsimulation. In this scenario, can we sequentially choose parameters that attain\na near-optimal overall number of iterations, without extra matrix computations?\nWe answer in the affirmative for Successive Over-Relaxation (SOR), a standard\nsolver whose parameter $\\omega$ has a strong impact on its runtime. For this\nmethod, we prove that a bandit online learning algorithm -- using only the\nnumber of iterations as feedback -- can select parameters for a sequence of\ninstances such that the overall cost approaches that of the best fixed $\\omega$\nas the sequence length increases. Furthermore, when given additional structural\ninformation, we show that a contextual bandit method asymptotically achieves\nthe performance of the instance-optimal policy, which selects the best $\\omega$\nfor each instance. 
Our work provides the first learning-theoretic treatment of\nhigh-precision linear system solvers and the first end-to-end guarantees for\ndata-driven scientific computing, demonstrating theoretically the potential to\nspeed up numerical methods using well-understood learning algorithms.\n","authors":["Mikhail Khodak","Edmond Chow","Maria-Florina Balcan","Ameet Talwalkar"],"pdf_url":"https://arxiv.org/pdf/2310.02246v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02243v1","updated":"2023-10-03T17:50:26Z","published":"2023-10-03T17:50:26Z","title":"Learning quantum Hamiltonians at any temperature in polynomial time","summary":" We study the problem of learning a local quantum Hamiltonian $H$ given copies\nof its Gibbs state $\\rho = e^{-\\beta H}/\\textrm{tr}(e^{-\\beta H})$ at a known\ninverse temperature $\\beta>0$. Anshu, Arunachalam, Kuwahara, and Soleimanifar\n(arXiv:2004.07266) gave an algorithm to learn a Hamiltonian on $n$ qubits to\nprecision $\\epsilon$ with only polynomially many copies of the Gibbs state, but\nwhich takes exponential time. Obtaining a computationally efficient algorithm\nhas been a major open problem [Alhambra'22 (arXiv:2204.08349)], [Anshu,\nArunachalam'22 (arXiv:2204.08349)], with prior work only resolving this in the\nlimited cases of high temperature [Haah, Kothari, Tang'21 (arXiv:2108.04842)]\nor commuting terms [Anshu, Arunachalam, Kuwahara, Soleimanifar'21]. We fully\nresolve this problem, giving a polynomial time algorithm for learning $H$ to\nprecision $\\epsilon$ from polynomially many copies of the Gibbs state at any\nconstant $\\beta > 0$.\n Our main technical contribution is a new flat polynomial approximation to the\nexponential function, and a translation between multi-variate scalar\npolynomials and nested commutators. This enables us to formulate Hamiltonian\nlearning as a polynomial system. We then show that solving a low-degree\nsum-of-squares relaxation of this polynomial system suffices to accurately\nlearn the Hamiltonian.\n","authors":["Ainesh Bakshi","Allen Liu","Ankur Moitra","Ewin Tang"],"pdf_url":"https://arxiv.org/pdf/2310.02243v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02237v1","updated":"2023-10-03T17:47:25Z","published":"2023-10-03T17:47:25Z","title":"Exploring Model Learning Heterogeneity for Boosting Ensemble Robustness","summary":" Deep neural network ensembles hold the potential of improving generalization\nperformance for complex learning tasks. This paper presents formal analysis and\nempirical evaluation to show that heterogeneous deep ensembles with high\nensemble diversity can effectively leverage model learning heterogeneity to\nboost ensemble robustness. We first show that heterogeneous DNN models trained\nfor solving the same learning problem, e.g., object detection, can\nsignificantly strengthen the mean average precision (mAP) through our weighted\nbounding box ensemble consensus method. Second, we further compose ensembles of\nheterogeneous models for solving different learning problems, e.g., object\ndetection and semantic segmentation, by introducing the connected component\nlabeling (CCL) based alignment. We show that this two-tier heterogeneity driven\nensemble construction method can compose an ensemble team that promotes high\nensemble diversity and low negative correlation among member models of the\nensemble, strengthening ensemble robustness against both negative examples and\nadversarial attacks. 
Third, we provide a formal analysis of the ensemble\nrobustness in terms of negative correlation. Extensive experiments validate the\nenhanced robustness of heterogeneous ensembles in both benign and adversarial\nsettings. The source codes are available on GitHub at\nhttps://github.com/git-disl/HeteRobust.\n","authors":["Yanzhao Wu","Ka-Ho Chow","Wenqi Wei","Ling Liu"],"pdf_url":"https://arxiv.org/pdf/2310.02237v1.pdf","comment":"Accepted by IEEE ICDM 2023"},{"id":"http://arxiv.org/abs/2310.02235v1","updated":"2023-10-03T17:45:39Z","published":"2023-10-03T17:45:39Z","title":"Automatic Quality Assessment of Wikipedia Articles -- A Systematic\n Literature Review","summary":" Wikipedia is the world's largest online encyclopedia, but maintaining article\nquality through collaboration is challenging. Wikipedia designed a quality\nscale, but with such a manual assessment process, many articles remain\nunassessed. We review existing methods for automatically measuring the quality\nof Wikipedia articles, identifying and comparing machine learning algorithms,\narticle features, quality metrics, and used datasets, examining 149 distinct\nstudies, and exploring commonalities and gaps in them. The literature is\nextensive, and the approaches follow past technological trends. However,\nmachine learning is still not widely used by Wikipedia, and we hope that our\nanalysis helps future researchers change that reality.\n","authors":["Pedro Miguel Moás","Carla Teixeira Lopes"],"pdf_url":"https://arxiv.org/pdf/2310.02235v1.pdf","comment":"37 pages, 10 figures, just accepted in ACM Computing Surveys\n (September 2023). This is the author's version of the work. It is posted here\n for your personal use. Not for redistribution. The definitive Version of\n Record was published in ACM Computing Surveys,\n https://dx.doi.org/10.1145/3625286"},{"id":"http://arxiv.org/abs/2310.02234v1","updated":"2023-10-03T17:43:24Z","published":"2023-10-03T17:43:24Z","title":"MIS-AVioDD: Modality Invariant and Specific Representation for\n Audio-Visual Deepfake Detection","summary":" Deepfakes are synthetic media generated using deep generative algorithms and\nhave posed a severe societal and political threat. Apart from facial\nmanipulation and synthetic voice, recently, a novel kind of deepfakes has\nemerged with either audio or visual modalities manipulated. In this regard, a\nnew generation of multimodal audio-visual deepfake detectors is being\ninvestigated to collectively focus on audio and visual data for multimodal\nmanipulation detection. Existing multimodal (audio-visual) deepfake detectors\nare often based on the fusion of the audio and visual streams from the video.\nExisting studies suggest that these multimodal detectors often obtain\nequivalent performances with unimodal audio and visual deepfake detectors. We\nconjecture that the heterogeneous nature of the audio and visual signals\ncreates distributional modality gaps and poses a significant challenge to\neffective fusion and efficient performance. In this paper, we tackle the\nproblem at the representation level to aid the fusion of audio and visual\nstreams for multimodal deepfake detection. Specifically, we propose the joint\nuse of modality (audio and visual) invariant and specific representations. This\nensures that the common patterns and patterns specific to each modality\nrepresenting pristine or fake content are preserved and fused for multimodal\ndeepfake manipulation detection. 
Our experimental results on FakeAVCeleb and\nKoDF audio-visual deepfake datasets suggest the enhanced accuracy of our\nproposed method over SOTA unimodal and multimodal audio-visual deepfake\ndetectors by $17.8$% and $18.4$%, respectively. Thus, obtaining\nstate-of-the-art performance.\n","authors":["Vinaya Sree Katamneni","Ajita Rattani"],"pdf_url":"https://arxiv.org/pdf/2310.02234v1.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2310.02233v1","updated":"2023-10-03T17:42:11Z","published":"2023-10-03T17:42:11Z","title":"Generalized Schrödinger Bridge Matching","summary":" Modern distribution matching algorithms for training diffusion or flow models\ndirectly prescribe the time evolution of the marginal distributions between two\nboundary distributions. In this work, we consider a generalized distribution\nmatching setup, where these marginals are only implicitly described as a\nsolution to some task-specific objective function. The problem setup, known as\nthe Generalized Schr\\\"odinger Bridge (GSB), appears prevalently in many\nscientific areas both within and without machine learning. We propose\nGeneralized Schr\\\"odinger Bridge Matching (GSBM), a new matching algorithm\ninspired by recent advances, generalizing them beyond kinetic energy\nminimization and to account for task-specific state costs. We show that such a\ngeneralization can be cast as solving conditional stochastic optimal control,\nfor which efficient variational approximations can be used, and further\ndebiased with the aid of path integral theory. Compared to prior methods for\nsolving GSB problems, our GSBM algorithm always preserves a feasible transport\nmap between the boundary distributions throughout training, thereby enabling\nstable convergence and significantly improved scalability. We empirically\nvalidate our claims on an extensive suite of experimental setups, including\ncrowd navigation, opinion depolarization, LiDAR manifolds, and image domain\ntransfer. Our work brings new algorithmic opportunities for training diffusion\nmodels enhanced with task-specific optimality structures.\n","authors":["Guan-Horng Liu","Yaron Lipman","Maximilian Nickel","Brian Karrer","Evangelos A. Theodorou","Ricky T. Q. Chen"],"pdf_url":"https://arxiv.org/pdf/2310.02233v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02232v1","updated":"2023-10-03T17:42:09Z","published":"2023-10-03T17:42:09Z","title":"HoloNets: Spectral Convolutions do extend to Directed Graphs","summary":" Within the graph learning community, conventional wisdom dictates that\nspectral convolutional networks may only be deployed on undirected graphs: Only\nthere could the existence of a well-defined graph Fourier transform be\nguaranteed, so that information may be translated between spatial- and spectral\ndomains. Here we show this traditional reliance on the graph Fourier transform\nto be superfluous and -- making use of certain advanced tools from complex\nanalysis and spectral theory -- extend spectral convolutions to directed\ngraphs. We provide a frequency-response interpretation of newly developed\nfilters, investigate the influence of the basis used to express filters and\ndiscuss the interplay with characteristic operators on which networks are\nbased. 
In order to thoroughly test the developed theory, we conduct experiments\nin real world settings, showcasing that directed spectral convolutional\nnetworks provide new state of the art results for heterophilic node\nclassification on many datasets and -- as opposed to baselines -- may be\nrendered stable to resolution-scale varying topological perturbations.\n","authors":["Christian Koke","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2310.02232v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02227v1","updated":"2023-10-03T17:32:44Z","published":"2023-10-03T17:32:44Z","title":"SNIP: Bridging Mathematical Symbolic and Numeric Realms with Unified\n Pre-training","summary":" In an era where symbolic mathematical equations are indispensable for\nmodeling complex natural phenomena, scientific inquiry often involves\ncollecting observations and translating them into mathematical expressions.\nRecently, deep learning has emerged as a powerful tool for extracting insights\nfrom data. However, existing models typically specialize in either numeric or\nsymbolic domains, and are usually trained in a supervised manner tailored to\nspecific tasks. This approach neglects the substantial benefits that could\narise from a task-agnostic unified understanding between symbolic equations and\ntheir numeric counterparts. To bridge the gap, we introduce SNIP, a\nSymbolic-Numeric Integrated Pre-training, which employs joint contrastive\nlearning between symbolic and numeric domains, enhancing their mutual\nsimilarities in the pre-trained embeddings. By performing latent space\nanalysis, we observe that SNIP provides cross-domain insights into the\nrepresentations, revealing that symbolic supervision enhances the embeddings of\nnumeric data and vice versa. We evaluate SNIP across diverse tasks, including\nsymbolic-to-numeric mathematical property prediction and numeric-to-symbolic\nequation discovery, commonly known as symbolic regression. Results show that\nSNIP effectively transfers to various tasks, consistently outperforming fully\nsupervised baselines and competing strongly with established task-specific\nmethods, especially in few-shot learning scenarios where available data is\nlimited.\n","authors":["Kazem Meidani","Parshin Shojaee","Chandan K. Reddy","Amir Barati Farimani"],"pdf_url":"https://arxiv.org/pdf/2310.02227v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02226v1","updated":"2023-10-03T17:32:41Z","published":"2023-10-03T17:32:41Z","title":"Think before you speak: Training Language Models With Pause Tokens","summary":" Language models generate responses by producing a series of tokens in\nimmediate succession: the $(K+1)^{th}$ token is an outcome of manipulating $K$\nhidden vectors per layer, one vector per preceding token. What if instead we\nwere to let the model manipulate say, $K+10$ hidden vectors, before it outputs\nthe $(K+1)^{th}$ token? We operationalize this idea by performing training and\ninference on language models with a (learnable) $\\textit{pause}$ token, a\nsequence of which is appended to the input prefix. We then delay extracting the\nmodel's outputs until the last pause token is seen, thereby allowing the model\nto process extra computation before committing to an answer. We empirically\nevaluate $\\textit{pause-training}$ on decoder-only models of 1B and 130M\nparameters with causal pretraining on C4, and on downstream tasks covering\nreasoning, question-answering, general understanding and fact recall. 
Our main\nfinding is that inference-time delays show gains when the model is both\npre-trained and finetuned with delays. For the 1B model, we witness gains on 8\nof 9 tasks, most prominently, a gain of $18\\%$ EM score on the QA task of\nSQuAD, $8\\%$ on CommonSenseQA and $1\\%$ accuracy on the reasoning task of\nGSM8k. Our work raises a range of conceptual and practical future research\nquestions on making delayed next-token prediction a widely applicable new\nparadigm.\n","authors":["Sachin Goyal","Ziwei Ji","Ankit Singh Rawat","Aditya Krishna Menon","Sanjiv Kumar","Vaishnavh Nagarajan"],"pdf_url":"https://arxiv.org/pdf/2310.02226v1.pdf","comment":"19 pages, 7 figures"},{"id":"http://arxiv.org/abs/2310.02221v1","updated":"2023-10-03T17:27:30Z","published":"2023-10-03T17:27:30Z","title":"Structurally guided task decomposition in spatial navigation tasks","summary":" How are people able to plan so efficiently despite limited cognitive\nresources? We aimed to answer this question by extending an existing model of\nhuman task decomposition that can explain a wide range of simple planning\nproblems by adding structure information to the task to facilitate planning in\nmore complex tasks. The extended model was then applied to a more complex\nplanning domain of spatial navigation. Our results suggest that our framework\ncan correctly predict the navigation strategies of the majority of the\nparticipants in an online experiment.\n","authors":["Ruiqi He","Carlos G. Correa","Thomas L. Griffiths","Mark K. Ho"],"pdf_url":"https://arxiv.org/pdf/2310.02221v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02219v1","updated":"2023-10-03T17:27:10Z","published":"2023-10-03T17:27:10Z","title":"What do we learn from a large-scale study of pre-trained visual\n representations in sim and real environments?","summary":" We present a large empirical investigation on the use of pre-trained visual\nrepresentations (PVRs) for training downstream policies that execute real-world\ntasks. Our study spans five different PVRs, two different policy-learning\nparadigms (imitation and reinforcement learning), and three different robots\nfor 5 distinct manipulation and indoor navigation tasks. From this effort, we\ncan arrive at three insights: 1) the performance trends of PVRs in the\nsimulation are generally indicative of their trends in the real world, 2) the\nuse of PVRs enables a first-of-its-kind result with indoor ImageNav (zero-shot\ntransfer to a held-out scene in the real world), and 3) the benefits from\nvariations in PVRs, primarily data-augmentation and fine-tuning, also transfer\nto the real-world performance. See project website for additional details and\nvisuals.\n","authors":["Sneha Silwal","Karmesh Yadav","Tingfan Wu","Jay Vakil","Arjun Majumdar","Sergio Arnaud","Claire Chen","Vincent-Pierre Berges","Dhruv Batra","Aravind Rajeswaran","Mrinal Kalakrishnan","Franziska Meier","Oleksandr Maksymets"],"pdf_url":"https://arxiv.org/pdf/2310.02219v1.pdf","comment":"Project website https://pvrs-sim2real.github.io/"},{"id":"http://arxiv.org/abs/2310.02215v1","updated":"2023-10-03T17:17:44Z","published":"2023-10-03T17:17:44Z","title":"An experimental system for detection and localization of hemorrhage\n using ultra-wideband microwaves with deep learning","summary":" Stroke is a leading cause of mortality and disability. 
Emergent diagnosis and\nintervention are critical, and predicated upon initial brain imaging; however,\nexisting clinical imaging modalities are generally costly, immobile, and demand\nhighly specialized operation and interpretation. Low-energy microwaves have\nbeen explored as low-cost, small form factor, fast, and safe probes of tissue\ndielectric properties, with both imaging and diagnostic potential.\nNevertheless, challenges inherent to microwave reconstruction have impeded\nprogress, hence microwave imaging (MWI) remains an elusive scientific aim.\nHerein, we introduce a dedicated experimental framework comprising a robotic\nnavigation system to translate blood-mimicking phantoms within an anatomically\nrealistic human head model. An 8-element ultra-wideband (UWB) array of modified\nantipodal Vivaldi antennas was developed and driven by a two-port vector\nnetwork analyzer spanning 0.6-9.0 GHz at an operating power of 1 mw. Complex\nscattering parameters were measured, and dielectric signatures of hemorrhage\nwere learned using a dedicated deep neural network for prediction of hemorrhage\nclasses and localization. An overall sensitivity and specificity for detection\n>0.99 was observed, with Rayliegh mean localization error of 1.65 mm. The study\nestablishes the feasibility of a robust experimental model and deep learning\nsolution for UWB microwave stroke detection.\n","authors":["Eisa Hedayati","Fatemeh Safari","George Verghese","Vito R. Ciancia","Daniel K. Sodickson","Seena Dehkharghani","Leeor Alon"],"pdf_url":"https://arxiv.org/pdf/2310.02215v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02207v1","updated":"2023-10-03T17:06:52Z","published":"2023-10-03T17:06:52Z","title":"Language Models Represent Space and Time","summary":" The capabilities of large language models (LLMs) have sparked debate over\nwhether such systems just learn an enormous collection of superficial\nstatistics or a coherent model of the data generating process -- a world model.\nWe find evidence for the latter by analyzing the learned representations of\nthree spatial datasets (world, US, NYC places) and three temporal datasets\n(historical figures, artworks, news headlines) in the Llama-2 family of models.\nWe discover that LLMs learn linear representations of space and time across\nmultiple scales. These representations are robust to prompting variations and\nunified across different entity types (e.g. cities and landmarks). In addition,\nwe identify individual ``space neurons'' and ``time neurons'' that reliably\nencode spatial and temporal coordinates. Our analysis demonstrates that modern\nLLMs acquire structured knowledge about fundamental dimensions such as space\nand time, supporting the view that they learn not merely superficial\nstatistics, but literal world models.\n","authors":["Wes Gurnee","Max Tegmark"],"pdf_url":"https://arxiv.org/pdf/2310.02207v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02206v1","updated":"2023-10-03T17:04:33Z","published":"2023-10-03T17:04:33Z","title":"Chunking: Forgetting Matters in Continual Learning even without Changing\n Tasks","summary":" Work on continual learning (CL) has largely focused on the problems arising\nfrom the dynamically-changing data distribution. However, CL can be decomposed\ninto two sub-problems: (a) shifts in the data distribution, and (b) dealing\nwith the fact that the data is split into chunks and so only a part of the data\nis available to be trained on at any point in time. 
In this work, we look at\nthe latter sub-problem -- the chunking of data -- and note that previous\nanalysis of chunking in the CL literature is sparse. We show that chunking is\nan important part of CL, accounting for around half of the performance drop\nfrom offline learning in our experiments. Furthermore, our results reveal that\ncurrent CL algorithms do not address the chunking sub-problem, only performing\nas well as plain SGD training when there is no shift in the data distribution.\nWe analyse why performance drops when learning occurs on chunks of data, and\nfind that forgetting, which is often seen to be a problem due to distribution\nshift, still arises and is a significant problem. Motivated by an analysis of\nthe linear case, we show that per-chunk weight averaging improves performance\nin the chunking setting and that this performance transfers to the full CL\nsetting. Hence, we argue that work on chunking can help advance CL in general.\n","authors":["Thomas L. Lee","Amos Storkey"],"pdf_url":"https://arxiv.org/pdf/2310.02206v1.pdf","comment":"9 pages, 11 figures, preprint"},{"id":"http://arxiv.org/abs/2307.10490v4","updated":"2023-10-03T17:03:10Z","published":"2023-07-19T23:03:20Z","title":"Abusing Images and Sounds for Indirect Instruction Injection in\n Multi-Modal LLMs","summary":" We demonstrate how images and sounds can be used for indirect prompt and\ninstruction injection in multi-modal LLMs. An attacker generates an adversarial\nperturbation corresponding to the prompt and blends it into an image or audio\nrecording. When the user asks the (unmodified, benign) model about the\nperturbed image or audio, the perturbation steers the model to output the\nattacker-chosen text and/or make the subsequent dialog follow the attacker's\ninstruction. We illustrate this attack with several proof-of-concept examples\ntargeting LLaVa and PandaGPT.\n","authors":["Eugene Bagdasaryan","Tsung-Yin Hsieh","Ben Nassi","Vitaly Shmatikov"],"pdf_url":"https://arxiv.org/pdf/2307.10490v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2008.05825v4","updated":"2023-10-03T17:00:23Z","published":"2020-08-13T11:28:57Z","title":"Unifying supervised learning and VAEs -- coverage, systematics and\n goodness-of-fit in normalizing-flow based neural network models for\n astro-particle reconstructions","summary":" Neural-network based predictions of event properties in astro-particle\nphysics are getting more and more common. However, in many cases the result is\njust utilized as a point prediction. Statistical uncertainties and coverage\n(1), systematic uncertainties (2) or a goodness-of-fit measure (3) are often\nnot calculated. Here we describe a certain choice of training and network\narchitecture that allows to incorporate all these properties into a single\nnetwork model. We show that a KL-divergence objective of the joint distribution\nof data and labels allows to unify supervised learning and variational\nautoencoders (VAEs) under one umbrella of stochastic variational inference. The\nunification motivates an extended supervised learning scheme which allows to\ncalculate a goodness-of-fit p-value for the neural network model. Conditional\nnormalizing flows amortized with a neural network are crucial in this\nconstruction. We discuss how they allow to rigorously define coverage for\nposteriors defined jointly on a product space, e.g. $\\mathbb{R}^n \\times\n\\mathcal{S}^m$, which encompasses posteriors over directions. 
Finally,\nsystematic uncertainties are naturally included in the variational viewpoint.\nThe proposed extended supervised training with amortized normalizing flows\nincorporates (1) coverage calculation, (2) systematics and (3) a\ngoodness-of-fit measure in a single machine-learning model. There are no\nconstraints on the shape of the involved distributions (e.g. Gaussianity) for\nthese properties to hold, in fact it works with complex multi-modal\ndistributions defined on product spaces like $\\mathbb{R}^n \\times\n\\mathcal{S}^m$. We see great potential for exploiting this per-event\ninformation in event selections or for fast astronomical alerts which require\nuncertainty guarantees.\n","authors":["Thorsten Glüsenkamp"],"pdf_url":"https://arxiv.org/pdf/2008.05825v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2011.12478v2","updated":"2023-10-03T16:53:05Z","published":"2020-11-25T01:57:51Z","title":"Minimax Estimation of Distances on a Surface and Minimax Manifold\n Learning in the Isometric-to-Convex Setting","summary":" We start by considering the problem of estimating intrinsic distances on a\nsmooth submanifold. We show that minimax optimality can be obtained via a\nreconstruction of the surface, and discuss the use of a particular mesh\nconstruction -- the tangential Delaunay complex -- for that purpose. We then\nturn to manifold learning and argue that a variant of Isomap where the\ndistances are instead computed on a reconstructed surface is minimax optimal\nfor the isometric variant of the problem.\n","authors":["Ery Arias-Castro","Phong Alain Chau"],"pdf_url":"https://arxiv.org/pdf/2011.12478v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.11838v4","updated":"2023-10-03T16:46:16Z","published":"2022-08-25T02:58:23Z","title":"Learning Task Automata for Reinforcement Learning using Hidden Markov\n Models","summary":" Training reinforcement learning (RL) agents using scalar reward signals is\noften infeasible when an environment has sparse and non-Markovian rewards.\nMoreover, handcrafting these reward functions before training is prone to\nmisspecification, especially when the environment's dynamics are only partially\nknown. This paper proposes a novel pipeline for learning non-Markovian task\nspecifications as succinct finite-state `task automata' from episodes of agent\nexperience within unknown environments. We leverage two key algorithmic\ninsights. First, we learn a product MDP, a model composed of the\nspecification's automaton and the environment's MDP (both initially unknown),\nby treating the product MDP as a partially observable MDP and using the\nwell-known Baum-Welch algorithm for learning hidden Markov models. Second, we\npropose a novel method for distilling the task automaton (assumed to be a\ndeterministic finite automaton) from the learnt product MDP. Our learnt task\nautomaton enables the decomposition of a task into its constituent sub-tasks,\nwhich improves the rate at which an RL agent can later synthesise an optimal\npolicy. It also provides an interpretable encoding of high-level environmental\nand task features, so a human can readily verify that the agent has learnt\ncoherent tasks with no misspecifications. In addition, we take steps towards\nensuring that the learnt automaton is environment-agnostic, making it\nwell-suited for use in transfer learning. 
Finally, we provide experimental\nresults compared with two baselines to illustrate our algorithm's performance\nin different environments and tasks.\n","authors":["Alessandro Abate","Yousif Almulla","James Fox","David Hyland","Michael Wooldridge"],"pdf_url":"https://arxiv.org/pdf/2208.11838v4.pdf","comment":"14 pages, 7 figures, Accepted to the 26th European Conference on\n Artificial Intelligence (ECAI 2023)"},{"id":"http://arxiv.org/abs/2310.01307v2","updated":"2023-10-03T16:40:35Z","published":"2023-10-02T16:13:08Z","title":"On the Generalization of Training-based ChatGPT Detection Methods","summary":" ChatGPT is one of the most popular language models which achieve amazing\nperformance on various natural language tasks. Consequently, there is also an\nurgent need to detect the texts generated ChatGPT from human written. One of\nthe extensively studied methods trains classification models to distinguish\nboth. However, existing studies also demonstrate that the trained models may\nsuffer from distribution shifts (during test), i.e., they are ineffective to\npredict the generated texts from unseen language tasks or topics. In this work,\nwe aim to have a comprehensive investigation on these methods' generalization\nbehaviors under distribution shift caused by a wide range of factors, including\nprompts, text lengths, topics, and language tasks. To achieve this goal, we\nfirst collect a new dataset with human and ChatGPT texts, and then we conduct\nextensive studies on the collected dataset. Our studies unveil insightful\nfindings which provide guidance for developing future methodologies or data\ncollection strategies for ChatGPT detection.\n","authors":["Han Xu","Jie Ren","Pengfei He","Shenglai Zeng","Yingqian Cui","Amy Liu","Hui Liu","Jiliang Tang"],"pdf_url":"https://arxiv.org/pdf/2310.01307v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02193v1","updated":"2023-10-03T16:39:21Z","published":"2023-10-03T16:39:21Z","title":"Uncertainty Quantification in Inverse Models in Hydrology","summary":" In hydrology, modeling streamflow remains a challenging task due to the\nlimited availability of basin characteristics information such as soil geology\nand geomorphology. These characteristics may be noisy due to measurement errors\nor may be missing altogether. To overcome this challenge, we propose a\nknowledge-guided, probabilistic inverse modeling method for recovering physical\ncharacteristics from streamflow and weather data, which are more readily\navailable. We compare our framework with state-of-the-art inverse models for\nestimating river basin characteristics. We also show that these estimates offer\nimprovement in streamflow modeling as opposed to using the original basin\ncharacteristic values. Our inverse model offers 3\\% improvement in R$^2$ for\nthe inverse model (basin characteristic estimation) and 6\\% for the forward\nmodel (streamflow prediction). Our framework also offers improved\nexplainability since it can quantify uncertainty in both the inverse and the\nforward model. Uncertainty quantification plays a pivotal role in improving the\nexplainability of machine learning models by providing additional insights into\nthe reliability and limitations of model predictions. In our analysis, we\nassess the quality of the uncertainty estimates. 
Compared to baseline\nuncertainty quantification methods, our framework offers 10\\% improvement in\nthe dispersion of epistemic uncertainty and 13\\% improvement in coverage rate.\nThis information can help stakeholders understand the level of uncertainty\nassociated with the predictions and provide a more comprehensive view of the\npotential outcomes.\n","authors":["Somya Sharma Chatterjee","Rahul Ghosh","Arvind Renganathan","Xiang Li","Snigdhansu Chatterjee","John Nieber","Christopher Duffy","Vipin Kumar"],"pdf_url":"https://arxiv.org/pdf/2310.02193v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2210.06213"},{"id":"http://arxiv.org/abs/2211.13976v4","updated":"2023-10-03T16:13:56Z","published":"2022-11-25T09:38:22Z","title":"Expanding Small-Scale Datasets with Guided Imagination","summary":" The power of DNNs relies heavily on the quantity and quality of training\ndata. However, collecting and annotating data on a large scale is often\nexpensive and time-consuming. To address this issue, we explore a new task,\ntermed dataset expansion, aimed at expanding a ready-to-use small dataset by\nautomatically creating new labeled samples. To this end, we present a Guided\nImagination Framework (GIF) that leverages cutting-edge generative models like\nDALL-E2 and Stable Diffusion (SD) to \"imagine\" and create informative new data\nfrom the input seed data. Specifically, GIF conducts data imagination by\noptimizing the latent features of the seed data in the semantically meaningful\nspace of the prior model, resulting in the creation of photo-realistic images\nwith new content. To guide the imagination towards creating informative samples\nfor model training, we introduce two key criteria, i.e., class-maintained\ninformation boosting and sample diversity promotion. These criteria are\nverified to be essential for effective dataset expansion: GIF-SD obtains 13.5%\nhigher model accuracy on natural image datasets than unguided expansion with\nSD. With these essential criteria, GIF successfully expands small datasets in\nvarious scenarios, boosting model accuracy by 36.9% on average over six natural\nimage datasets and by 13.5% on average over three medical datasets. The source\ncode is available at https://github.com/Vanint/DatasetExpansion.\n","authors":["Yifan Zhang","Daquan Zhou","Bryan Hooi","Kai Wang","Jiashi Feng"],"pdf_url":"https://arxiv.org/pdf/2211.13976v4.pdf","comment":"NeurIPS 2023. Source code: https://github.com/Vanint/DatasetExpansion"},{"id":"http://arxiv.org/abs/2310.02174v1","updated":"2023-10-03T16:08:41Z","published":"2023-10-03T16:08:41Z","title":"Ask Again, Then Fail: Large Language Models' Vacillations in Judgement","summary":" With the emergence of generative conversational large language models (LLMs)\nlike ChatGPT, serving as virtual assistants in various fields, the stability\nand reliability of their responses have become crucial. However, during usage,\nit has been observed that these models tend to waver in their judgements when\nconfronted with follow-up questions from users expressing skepticism or\ndisagreement. In this work, we draw inspiration from questioning strategies in\neducation and propose a \\textsc{Follow-up Questioning Mechanism} along with two\nevaluation metrics to assess the judgement consistency of LLMs before and after\nexposure to disturbances. We evaluate the judgement consistency of ChatGPT,\nPaLM2-Bison, and Vicuna-13B under this mechanism across eight reasoning\nbenchmarks. 
Empirical results show that even when the initial answers are\ncorrect, judgement consistency sharply decreases when LLMs face disturbances\nsuch as questioning, negation, or misleading. Additionally, we study these\nmodels' judgement consistency under various settings (sampling temperature and\nprompts) to validate this issue further, observing the impact of prompt tone\nand conducting an in-depth error analysis for deeper behavioral insights.\nFurthermore, we also explore several prompting methods to mitigate this issue\nand demonstrate their\neffectiveness\\footnote{\\url{https://github.com/NUSTM/LLMs-Waver-In-Judgements}}.\n","authors":["Qiming Xie","Zengzhi Wang","Yi Feng","Rui Xia"],"pdf_url":"https://arxiv.org/pdf/2310.02174v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02172v1","updated":"2023-10-03T16:06:30Z","published":"2023-10-03T16:06:30Z","title":"Lyfe Agents: Generative agents for low-cost real-time social\n interactions","summary":" Highly autonomous generative agents powered by large language models promise\nto simulate intricate social behaviors in virtual societies. However, achieving\nreal-time interactions with humans at a low computational cost remains\nchallenging. Here, we introduce Lyfe Agents. They combine low-cost with\nreal-time responsiveness, all while remaining intelligent and goal-oriented.\nKey innovations include: (1) an option-action framework, reducing the cost of\nhigh-level decisions; (2) asynchronous self-monitoring for better\nself-consistency; and (3) a Summarize-and-Forget memory mechanism, prioritizing\ncritical memory items at a low cost. We evaluate Lyfe Agents' self-motivation\nand sociability across several multi-agent scenarios in our custom LyfeGame 3D\nvirtual environment platform. When equipped with our brain-inspired techniques,\nLyfe Agents can exhibit human-like self-motivated social reasoning. For\nexample, the agents can solve a crime (a murder mystery) through autonomous\ncollaboration and information exchange. Meanwhile, our techniques enabled Lyfe\nAgents to operate at a computational cost 10-100 times lower than existing\nalternatives. Our findings underscore the transformative potential of\nautonomous generative agents to enrich human social experiences in virtual\nworlds.\n","authors":["Zhao Kaiya","Michelangelo Naim","Jovana Kondic","Manuel Cortes","Jiaxin Ge","Shuying Luo","Guangyu Robert Yang","Andrew Ahn"],"pdf_url":"https://arxiv.org/pdf/2310.02172v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02168v1","updated":"2023-10-03T16:02:36Z","published":"2023-10-03T16:02:36Z","title":"Editing Personality for LLMs","summary":" This paper introduces an innovative task focused on editing the personality\ntraits of Large Language Models (LLMs). This task seeks to adjust the models'\nresponses to opinion-related questions on specified topics since an\nindividual's personality often manifests in the form of their expressed\nopinions, thereby showcasing different personality traits. Specifically, we\nconstruct a new benchmark dataset PersonalityEdit to address this task. Drawing\non the theory in Social Psychology, we isolate three representative traits,\nnamely Neuroticism, Extraversion, and Agreeableness, as the foundation for our\nbenchmark. We then gather data using GPT-4, generating responses that not only\nalign with a specified topic but also embody the targeted personality trait. We\nconduct comprehensive experiments involving various baselines and discuss the\nrepresentation of personality behavior in LLMs. 
Our intriguing findings uncover\npotential challenges of the proposed task, illustrating several remaining\nissues. We anticipate that our work can provide the NLP community with\ninsights. Code and datasets will be released at\nhttps://github.com/zjunlp/EasyEdit.\n","authors":["Shengyu Mao","Ningyu Zhang","Xiaohan Wang","Mengru Wang","Yunzhi Yao","Yong Jiang","Pengjun Xie","Fei Huang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2310.02168v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2310.00987v2","updated":"2023-10-03T16:00:34Z","published":"2023-10-02T08:52:29Z","title":"A Theoretical Analysis of the Test Error of Finite-Rank Kernel Ridge\n Regression","summary":" Existing statistical learning guarantees for general kernel regressors often\nyield loose bounds when used with finite-rank kernels. Yet, finite-rank kernels\nnaturally appear in several machine learning problems, e.g.\\ when fine-tuning a\npre-trained deep neural network's last layer to adapt it to a novel task when\nperforming transfer learning. We address this gap for finite-rank kernel ridge\nregression (KRR) by deriving sharp non-asymptotic upper and lower bounds for\nthe KRR test error of any finite-rank KRR. Our bounds are tighter than\npreviously derived bounds on finite-rank KRR, and unlike comparable results,\nthey also remain valid for any regularization parameters.\n","authors":["Tin Sum Cheng","Aurelien Lucchi","Ivan Dokmanić","Anastasis Kratsios","David Belius"],"pdf_url":"https://arxiv.org/pdf/2310.00987v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.00674v4","updated":"2023-10-03T15:50:30Z","published":"2023-02-01T18:59:36Z","title":"Improving Few-Shot Generalization by Exploring and Exploiting Auxiliary\n Data","summary":" Few-shot learning is valuable in many real-world applications, but learning a\ngeneralizable model without overfitting to the few labeled datapoints is\nchallenging. In this work, we focus on Few-shot Learning with Auxiliary Data\n(FLAD), a training paradigm that assumes access to auxiliary data during\nfew-shot learning in hopes of improving generalization. Previous works have\nproposed automated methods for mixing auxiliary and target data, but these\nmethods typically scale linearly (or worse) with the number of auxiliary\ndatasets, limiting their practicality. In this work we relate FLAD to the\nexplore-exploit dilemma that is central to the multi-armed bandit setting and\nderive algorithms whose computational complexity is independent of the number\nof auxiliary datasets, allowing us to scale to 100x more auxiliary datasets\nthan prior methods. We propose two algorithms -- EXP3-FLAD and UCB1-FLAD -- and\ncompare them with prior FLAD methods that either explore or exploit, finding\nthat the combination of exploration and exploitation is crucial. Through\nextensive experimentation we find that our methods outperform all pre-existing\nFLAD methods by 4% and lead to the first 3 billion parameter language models\nthat outperform the 175 billion parameter GPT-3. 
Overall, our work suggests\nthat the discovery of better, more efficient mixing strategies for FLAD may\nprovide a viable path towards substantially improving generalization in\nfew-shot learning.\n","authors":["Alon Albalak","Colin Raffel","William Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2302.00674v4.pdf","comment":"NeurIPS 2023, 25 pages, 8 figures, code available at\n https://github.com/alon-albalak/FLAD"},{"id":"http://arxiv.org/abs/2306.00814v2","updated":"2023-10-03T15:49:34Z","published":"2023-06-01T15:40:32Z","title":"Vocos: Closing the gap between time-domain and Fourier-based neural\n vocoders for high-quality audio synthesis","summary":" Recent advancements in neural vocoding are predominantly driven by Generative\nAdversarial Networks (GANs) operating in the time-domain. While effective, this\napproach neglects the inductive bias offered by time-frequency representations,\nresulting in redundant and computationally intensive upsampling operations.\nFourier-based time-frequency representation is an appealing alternative,\naligning more accurately with human auditory perception, and benefitting from\nwell-established fast algorithms for its computation. Nevertheless, direct\nreconstruction of complex-valued spectrograms has been historically\nproblematic, primarily due to phase recovery issues. This study seeks to close\nthis gap by presenting Vocos, a new model that directly generates Fourier\nspectral coefficients. Vocos not only matches the state-of-the-art in audio\nquality, as demonstrated in our evaluations, but it also substantially improves\ncomputational efficiency, achieving an order of magnitude increase in speed\ncompared to prevailing time-domain neural vocoding approaches. The source code\nand model weights have been open-sourced at\nhttps://github.com/charactr-platform/vocos.\n","authors":["Hubert Siuzdak"],"pdf_url":"https://arxiv.org/pdf/2306.00814v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02156v1","updated":"2023-10-03T15:43:59Z","published":"2023-10-03T15:43:59Z","title":"Probabilistically Rewired Message-Passing Neural Networks","summary":" Message-passing graph neural networks (MPNNs) emerged as powerful tools for\nprocessing graph-structured input. However, they operate on a fixed input graph\nstructure, ignoring potential noise and missing information. Furthermore, their\nlocal aggregation mechanism can lead to problems such as over-squashing and\nlimited expressive power in capturing relevant graph structures. Existing\nsolutions to these challenges have primarily relied on heuristic methods, often\ndisregarding the underlying data distribution. Hence, devising principled\napproaches for learning to infer graph structures relevant to the given\nprediction task remains an open challenge. In this work, leveraging recent\nprogress in exact and differentiable $k$-subset sampling, we devise\nprobabilistically rewired MPNNs (PR-MPNNs), which learn to add relevant edges\nwhile omitting less beneficial ones. For the first time, our theoretical\nanalysis explores how PR-MPNNs enhance expressive power, and we identify\nprecise conditions under which they outperform purely randomized approaches.\nEmpirically, we demonstrate that our approach effectively mitigates issues like\nover-squashing and under-reaching. 
In addition, on established real-world\ndatasets, our method exhibits competitive or superior predictive performance\ncompared to traditional MPNN models and recent graph transformer architectures.\n","authors":["Chendi Qian","Andrei Manolache","Kareem Ahmed","Zhe Zeng","Guy Van den Broeck","Mathias Niepert","Christopher Morris"],"pdf_url":"https://arxiv.org/pdf/2310.02156v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02152v1","updated":"2023-10-03T15:40:03Z","published":"2023-10-03T15:40:03Z","title":"Graph Neural Network-based EEG Classification: A Survey","summary":" Graph neural networks (GNN) are increasingly used to classify EEG for tasks\nsuch as emotion recognition, motor imagery and neurological diseases and\ndisorders. A wide range of methods have been proposed to design GNN-based\nclassifiers. Therefore, there is a need for a systematic review and\ncategorisation of these approaches. We exhaustively search the published\nliterature on this topic and derive several categories for comparison. These\ncategories highlight the similarities and differences among the methods. The\nresults suggest a prevalence of spectral graph convolutional layers over\nspatial. Additionally, we identify standard forms of node features, with the\nmost popular being the raw EEG signal and differential entropy. Our results\nsummarise the emerging trends in GNN-based approaches for EEG classification.\nFinally, we discuss several promising research directions, such as exploring\nthe potential of transfer learning methods and appropriate modelling of\ncross-frequency interactions.\n","authors":["Dominik Klepl","Min Wu","Fei He"],"pdf_url":"https://arxiv.org/pdf/2310.02152v1.pdf","comment":"14 pages, 3 figures"},{"id":"http://arxiv.org/abs/2310.02147v1","updated":"2023-10-03T15:34:21Z","published":"2023-10-03T15:34:21Z","title":"Finite-Time Analysis of Whittle Index based Q-Learning for Restless\n Multi-Armed Bandits with Neural Network Function Approximation","summary":" Whittle index policy is a heuristic to the intractable restless multi-armed\nbandits (RMAB) problem. Although it is provably asymptotically optimal, finding\nWhittle indices remains difficult. In this paper, we present Neural-Q-Whittle,\na Whittle index based Q-learning algorithm for RMAB with neural network\nfunction approximation, which is an example of nonlinear two-timescale\nstochastic approximation with Q-function values updated on a faster timescale\nand Whittle indices on a slower timescale. Despite the empirical success of\ndeep Q-learning, the non-asymptotic convergence rate of Neural-Q-Whittle, which\ncouples neural networks with two-timescale Q-learning largely remains unclear.\nThis paper provides a finite-time analysis of Neural-Q-Whittle, where data are\ngenerated from a Markov chain, and Q-function is approximated by a ReLU neural\nnetwork. 
Our analysis leverages a Lyapunov drift approach to capture the\nevolution of two coupled parameters, and the nonlinearity in value function\napproximation further requires us to characterize the approximation error.\nCombining these provides Neural-Q-Whittle with an $\\mathcal{O}(1/k^{2/3})$\nconvergence rate, where $k$ is the number of iterations.\n","authors":["Guojun Xiong","Jian Li"],"pdf_url":"https://arxiv.org/pdf/2310.02147v1.pdf","comment":"26 pages, 4 figures, Neurips 2023"},{"id":"http://arxiv.org/abs/2310.02133v1","updated":"2023-10-03T15:14:28Z","published":"2023-10-03T15:14:28Z","title":"Learning Reliable Logical Rules with SATNet","summary":" Bridging logical reasoning and deep learning is crucial for advanced AI\nsystems. In this work, we present a new framework that addresses this goal by\ngenerating interpretable and verifiable logical rules through differentiable\nlearning, without relying on pre-specified logical structures. Our approach\nbuilds upon SATNet, a differentiable MaxSAT solver that learns the underlying\nrules from input-output examples. Despite its efficacy, the learned weights in\nSATNet are not straightforwardly interpretable, failing to produce\nhuman-readable rules. To address this, we propose a novel specification method\ncalled \"maximum equality\", which enables the interchangeability between the\nlearned weights of SATNet and a set of propositional logical rules in weighted\nMaxSAT form. With the decoded weighted MaxSAT formula, we further introduce\nseveral effective verification techniques to validate it against the ground\ntruth rules. Experiments on stream transformations and Sudoku problems show\nthat our decoded rules are highly reliable: using exact solvers on them could\nachieve 100% accuracy, whereas the original SATNet fails to give correct\nsolutions in many cases. Furthermore, we formally verify that our decoded\nlogical rules are functionally equivalent to the ground truth ones.\n","authors":["Zhaoyu Li","Jinpei Guo","Yuhe Jiang","Xujie Si"],"pdf_url":"https://arxiv.org/pdf/2310.02133v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02129v1","updated":"2023-10-03T15:10:46Z","published":"2023-10-03T15:10:46Z","title":"Unveiling the Pitfalls of Knowledge Editing for Large Language Models","summary":" As the cost associated with fine-tuning Large Language Models (LLMs)\ncontinues to rise, recent research efforts have pivoted towards developing\nmethodologies to edit implicit knowledge embedded within LLMs. Yet, there's\nstill a dark cloud lingering overhead -- will knowledge editing trigger a\nbutterfly effect? It is still unclear whether knowledge editing might\nintroduce side effects that pose potential risks. This paper pioneers\nthe investigation into the potential pitfalls associated with knowledge editing\nfor LLMs. To achieve this, we introduce new benchmark datasets and propose\ninnovative evaluation metrics. Our results underline two pivotal concerns: (1)\nKnowledge Conflict: Editing groups of facts that logically clash can magnify\nthe inherent inconsistencies in LLMs -- a facet neglected by previous methods. (2)\nKnowledge Distortion: Altering parameters with the aim of editing factual\nknowledge can irrevocably warp the innate knowledge structure of LLMs.\nExperimental results vividly demonstrate that knowledge editing might\ninadvertently cast a shadow of unintended consequences on LLMs, which warrant\nattention and efforts for future works. 
Code will be released at\nhttps://github.com/zjunlp/PitfallsKnowledgeEditing.\n","authors":["Zhoubo Li","Ningyu Zhang","Yunzhi Yao","Mengru Wang","Xi Chen","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2310.02129v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2310.01259v2","updated":"2023-10-03T15:08:22Z","published":"2023-10-02T14:51:10Z","title":"Faster and Accurate Neural Networks with Semantic Inference","summary":" Deep neural networks (DNN) usually come with a significant computational\nburden. While approaches such as structured pruning and mobile-specific DNNs\nhave been proposed, they incur drastic accuracy loss. In this paper we leverage\nthe intrinsic redundancy in latent representations to reduce the computational\nload with limited loss in performance. We show that semantically similar inputs\nshare many filters, especially in the earlier layers. Thus, semantically\nsimilar classes can be clustered to create cluster-specific subgraphs. To this\nend, we propose a new framework called Semantic Inference (SINF). In short,\nSINF (i) identifies the semantic cluster the object belongs to using a small\nadditional classifier and (ii) executes the subgraph extracted from the base\nDNN related to that semantic cluster for inference. To extract each\ncluster-specific subgraph, we propose a new approach named Discriminative\nCapability Score (DCS) that finds the subgraph with the capability to\ndiscriminate among the members of a specific semantic cluster. DCS is\nindependent from SINF and can be applied to any DNN. We benchmark the\nperformance of DCS on the VGG16, VGG19, and ResNet50 DNNs trained on the\nCIFAR100 dataset against 6 state-of-the-art pruning approaches. Our results\nshow that (i) SINF reduces the inference time of VGG19, VGG16, and ResNet50\nrespectively by up to 35%, 29% and 15% with only 0.17%, 3.75%, and 6.75%\naccuracy loss (ii) DCS achieves respectively up to 3.65%, 4.25%, and 2.36%\nbetter accuracy with VGG16, VGG19, and ResNet50 with respect to existing\ndiscriminative scores (iii) when used as a pruning criterion, DCS achieves up\nto 8.13% accuracy gain with 5.82% less parameters than the existing state of\nthe art work published at ICLR 2023 (iv) when considering per-cluster accuracy,\nSINF performs on average 5.73%, 8.38% and 6.36% better than the base VGG16,\nVGG19, and ResNet50.\n","authors":["Sazzad Sayyed","Jonathan Ashdown","Francesco Restuccia"],"pdf_url":"https://arxiv.org/pdf/2310.01259v2.pdf","comment":"14 pages, 6 figures, conference format"},{"id":"http://arxiv.org/abs/2307.13903v2","updated":"2023-10-03T15:06:15Z","published":"2023-07-26T02:02:19Z","title":"Corruption-Robust Lipschitz Contextual Search","summary":" I study the problem of learning a Lipschitz function with corrupted binary\nsignals. The learner tries to learn a $L$-Lipschitz function $f: [0,1]^d\n\\rightarrow [0, L]$ that the adversary chooses. There is a total of $T$ rounds.\nIn each round $t$, the adversary selects a context vector $x_t$ in the input\nspace, and the learner makes a guess to the true function value $f(x_t)$ and\nreceives a binary signal indicating whether the guess is high or low. In a\ntotal of $C$ rounds, the signal may be corrupted, though the value of $C$ is\n\\emph{unknown} to the learner. The learner's goal is to incur a small\ncumulative loss. This work introduces the new algorithmic technique\n\\emph{agnostic checking} as well as new analysis techniques. 
I design\nalgorithms which: for the symmetric loss, the learner achieves regret $L\\cdot\nO(C\\log T)$ with $d = 1$ and $L\\cdot O_d(C\\log T + T^{(d-1)/d})$ with $d > 1$;\nfor the pricing loss, the learner achieves regret $L\\cdot \\widetilde{O}\n(T^{d/(d+1)} + C\\cdot T^{1/(d+1)})$.\n","authors":["Shiliang Zuo"],"pdf_url":"https://arxiv.org/pdf/2307.13903v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02124v1","updated":"2023-10-03T15:05:52Z","published":"2023-10-03T15:05:52Z","title":"Exploring Collaboration Mechanisms for LLM Agents: A Social Psychology\n View","summary":" As Natural Language Processing (NLP) systems are increasingly employed in\nintricate social environments, a pressing query emerges: Can these NLP systems\nmirror human-esque collaborative intelligence, in a multi-agent society\nconsisting of multiple large language models (LLMs)? This paper probes the\ncollaboration mechanisms among contemporary NLP systems by melding practical\nexperiments with theoretical insights. We fabricate four unique `societies'\ncomprised of LLM agents, where each agent is characterized by a specific\n`trait' (easy-going or overconfident) and engages in collaboration with a\ndistinct `thinking pattern' (debate or reflection). Evaluating these\nmulti-agent societies on three benchmark datasets, we discern that LLM agents\nnavigate tasks by leveraging diverse social behaviors, from active debates to\nintrospective reflections. Notably, certain collaborative strategies not only\noptimize efficiency (using fewer API tokens), but also outshine previous\ntop-tier approaches. Moreover, our results further illustrate that LLM agents\nmanifest human-like social behaviors, such as conformity or majority rule,\nmirroring foundational Social Psychology theories. In conclusion, we integrate\ninsights from Social Psychology to contextualize the collaboration of LLM\nagents, inspiring further investigations into the collaboration mechanism for\nLLMs. We commit to sharing our code and datasets (already submitted in\nsupplementary materials), hoping to catalyze further research in this promising\navenue (All code and data are available at\n\\url{https://github.com/zjunlp/MachineSoM}.).\n","authors":["Jintian Zhang","Xin Xu","Shumin Deng"],"pdf_url":"https://arxiv.org/pdf/2310.02124v1.pdf","comment":"Work in Progress"},{"id":"http://arxiv.org/abs/2103.00558v5","updated":"2023-10-03T14:59:09Z","published":"2021-02-28T16:43:37Z","title":"Is Simple Uniform Sampling Effective for Center-Based Clustering with\n Outliers: When and Why?","summary":" Real-world datasets often contain outliers, and the presence of outliers can\nmake clustering problems much more challenging. In this paper, we\npropose a simple uniform sampling framework for solving three representative\ncenter-based clustering with outliers problems: $k$-center/median/means\nclustering with outliers. Our analysis is fundamentally different from the\nprevious (uniform and non-uniform) sampling based ideas. To explain the\neffectiveness of uniform sampling in theory, we introduce a measure of\n\"significance\" and prove that the performance of our framework depends on the\nsignificance degree of the given instance. In particular, the sample size can\nbe independent of the input data size $n$ and the dimensionality $d$, if we\nassume the given instance is \"significant\", which is in fact a fairly\nreasonable assumption in practice. 
Due to its simplicity, the uniform sampling\napproach also enjoys several significant advantages over the non-uniform\nsampling approaches in practice. To the best of our knowledge, this is the\nfirst work that systematically studies the effectiveness of uniform sampling\nfrom both theoretical and experimental aspects.\n","authors":["Jiawei Huang","Wenjie Liu","Hu Ding"],"pdf_url":"https://arxiv.org/pdf/2103.00558v5.pdf","comment":"arXiv admin note: text overlap with arXiv:1905.10143"},{"id":"http://arxiv.org/abs/2310.02117v1","updated":"2023-10-03T14:59:00Z","published":"2023-10-03T14:59:00Z","title":"Symmetric Single Index Learning","summary":" Few neural architectures lend themselves to provable learning with gradient\nbased methods. One popular model is the single-index model, in which labels are\nproduced by composing an unknown linear projection with a possibly unknown\nscalar link function. Learning this model with SGD is relatively\nwell-understood, whereby the so-called information exponent of the link\nfunction governs a polynomial sample complexity rate. However, extending this\nanalysis to deeper or more complicated architectures remains challenging.\n In this work, we consider single index learning in the setting of symmetric\nneural networks. Under analytic assumptions on the activation and maximum\ndegree assumptions on the link function, we prove that gradient flow recovers\nthe hidden planted direction, represented as a finitely supported vector in the\nfeature space of power sum polynomials. We characterize a notion of information\nexponent adapted to our setting that controls the efficiency of learning.\n","authors":["Aaron Zweig","Joan Bruna"],"pdf_url":"https://arxiv.org/pdf/2310.02117v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.12227v3","updated":"2023-10-03T14:58:20Z","published":"2023-01-28T15:35:52Z","title":"Deep Operator Learning Lessens the Curse of Dimensionality for PDEs","summary":" Deep neural networks (DNNs) have achieved remarkable success in numerous\ndomains, and their application to PDE-related problems has been rapidly\nadvancing. This paper provides an estimate for the generalization error of\nlearning Lipschitz operators over Banach spaces using DNNs with applications to\nvarious PDE solution operators. The goal is to specify DNN width, depth, and\nthe number of training samples needed to guarantee a certain testing error.\nUnder mild assumptions on data distributions or operator structures, our\nanalysis shows that deep operator learning can have a relaxed dependence on the\ndiscretization resolution of PDEs and, hence, lessen the curse of\ndimensionality in many PDE-related problems including elliptic equations,\nparabolic equations, and Burgers equations. Our results are also applied to\ngive insights about discretization-invariance in operator learning.\n","authors":["Ke Chen","Chunmei Wang","Haizhao Yang"],"pdf_url":"https://arxiv.org/pdf/2301.12227v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02116v1","updated":"2023-10-03T14:57:31Z","published":"2023-10-03T14:57:31Z","title":"Hierarchical Concept Discovery Models: A Concept Pyramid Scheme","summary":" Deep Learning algorithms have recently gained significant attention due to\ntheir impressive performance. However, their high complexity and\nun-interpretable mode of operation hinders their confident deployment in\nreal-world safety-critical tasks. This work targets ante hoc interpretability,\nand specifically Concept Bottleneck Models (CBMs). 
Our goal is to design a\nframework that admits a highly interpretable decision making process with\nrespect to human understandable concepts, on multiple levels of granularity. To\nthis end, we propose a novel hierarchical concept discovery formulation\nleveraging: (i) recent advances in image-text models, and (ii) an innovative\nformulation for multi-level concept selection via data-driven and sparsity\ninducing Bayesian arguments. Within this framework, concept information does\nnot solely rely on the similarity between the whole image and general\nunstructured concepts; instead, we introduce the notion of concept hierarchy to\nuncover and exploit more granular concept information residing in\npatch-specific regions of the image scene. As we experimentally show, the\nproposed construction not only outperforms recent CBM approaches, but also\nyields a principled framework towards interpretability.\n","authors":["Konstantinos P. Panousis","Dino Ienco","Diego Marcos"],"pdf_url":"https://arxiv.org/pdf/2310.02116v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.00492v2","updated":"2023-10-03T14:55:55Z","published":"2023-03-01T13:27:06Z","title":"Lumos: Heterogeneity-aware Federated Graph Learning over Decentralized\n Devices","summary":" Graph neural networks (GNN) have been widely deployed in real-world networked\napplications and systems due to their capability to handle graph-structured\ndata. However, the growing awareness of data privacy severely challenges the\ntraditional centralized model training paradigm, where a server holds all the\ngraph information. Federated learning is an emerging collaborative computing\nparadigm that allows model training without data centralization. Existing\nfederated GNN studies mainly focus on systems where clients hold distinctive\ngraphs or sub-graphs. The practical node-level federated situation, where each\nclient is only aware of its direct neighbors, has yet to be studied. In this\npaper, we propose the first federated GNN framework called Lumos that supports\nsupervised and unsupervised learning with feature and degree protection on\nnode-level federated graphs. We first design a tree constructor to improve the\nrepresentation capability given the limited structural information. We further\npresent a Monte Carlo Markov Chain-based algorithm to mitigate the workload\nimbalance caused by degree heterogeneity with theoretically-guaranteed\nperformance. Based on the constructed tree for each client, a decentralized\ntree-based GNN trainer is proposed to support versatile training. Extensive\nexperiments demonstrate that Lumos outperforms the baseline with significantly\nhigher accuracy and greatly reduced communication cost and training time.\n","authors":["Qiying Pan","Yifei Zhu","Lingyang Chu"],"pdf_url":"https://arxiv.org/pdf/2303.00492v2.pdf","comment":"13 pages, 7 figures, published in the Proceedings of the 39th IEEE\n International Conference on Data Engineering (ICDE 2023)"},{"id":"http://arxiv.org/abs/2310.02113v1","updated":"2023-10-03T14:55:30Z","published":"2023-10-03T14:55:30Z","title":"FLEDGE: Ledger-based Federated Learning Resilient to Inference and\n Backdoor Attacks","summary":" Federated learning (FL) is a distributed learning process that uses a trusted\naggregation server to allow multiple parties (or clients) to collaboratively\ntrain a machine learning model without having them share their private data.\nRecent research, however, has demonstrated the effectiveness of inference and\npoisoning attacks on FL. 
Mitigating both attacks simultaneously is very\nchallenging. State-of-the-art solutions have proposed the use of poisoning\ndefenses with Secure Multi-Party Computation (SMPC) and/or Differential Privacy\n(DP). However, these techniques are not efficient and fail to address the\nmalicious intent behind the attacks, i.e., adversaries (curious servers and/or\ncompromised clients) seek to exploit a system for monetization purposes. To\novercome these limitations, we present a ledger-based FL framework known as\nFLEDGE that allows making parties accountable for their behavior and achieving\nreasonable efficiency for mitigating inference and poisoning attacks. Our\nsolution leverages crypto-currency to increase party accountability by\npenalizing malicious behavior and rewarding benign conduct. We conduct an\nextensive evaluation on four public datasets: Reddit, MNIST, Fashion-MNIST, and\nCIFAR-10. Our experimental results demonstrate that (1) FLEDGE provides strong\nprivacy guarantees for model updates without sacrificing model utility; (2)\nFLEDGE can successfully mitigate different poisoning attacks without degrading\nthe performance of the global model; and (3) FLEDGE offers unique reward\nmechanisms to promote benign behavior during model training and/or model\naggregation.\n","authors":["Jorge Castillo","Phillip Rieger","Hossein Fereidooni","Qian Chen","Ahmad Sadeghi"],"pdf_url":"https://arxiv.org/pdf/2310.02113v1.pdf","comment":"To appear in Annual Computer Security Applications Conference (ACSAC)\n 2023"},{"id":"http://arxiv.org/abs/2109.03459v3","updated":"2023-10-03T14:55:06Z","published":"2021-09-08T07:00:45Z","title":"Dual Correction Strategy for Ranking Distillation in Top-N Recommender\n System","summary":" Knowledge Distillation (KD), which transfers the knowledge of a well-trained\nlarge model (teacher) to a small model (student), has become an important area\nof research for practical deployment of recommender systems. Recently, Relaxed\nRanking Distillation (RRD) has shown that distilling the ranking information in\nthe recommendation list significantly improves the performance. However, the\nmethod still has limitations in that 1) it does not fully utilize the\nprediction errors of the student model, which makes the training not fully\nefficient, and 2) it only distills the user-side ranking information, which\nprovides an insufficient view under the sparse implicit feedback. This paper\npresents Dual Correction strategy for Distillation (DCD), which transfers the\nranking information from the teacher model to the student model in a more\nefficient manner. Most importantly, DCD uses the discrepancy between the\nteacher model and the student model predictions to decide which knowledge should be\ndistilled. By doing so, DCD essentially provides the learning guidance tailored\nto \"correcting\" what the student model has failed to accurately predict. This\nprocess is applied for transferring the ranking information from the user-side\nas well as the item-side to address sparse implicit user feedback. 
Our\nexperiments show that the proposed method outperforms the state-of-the-art\nbaselines, and ablation studies validate the effectiveness of each component.\n","authors":["Youngjune Lee","Kee-Eung Kim"],"pdf_url":"https://arxiv.org/pdf/2109.03459v3.pdf","comment":"CIKM 2021"},{"id":"http://arxiv.org/abs/2104.00253v4","updated":"2023-10-03T14:48:16Z","published":"2021-04-01T04:40:22Z","title":"Deep Contrastive Patch-Based Subspace Learning for Camera Image Signal\n Processing","summary":" Camera Image Signal Processing (ISP) pipelines can get appealing results in\ndifferent image signal processing tasks. Nonetheless, the majority of these\nmethods, including those employing an encoder-decoder deep architecture for the\ntask, typically utilize a uniform filter applied consistently across the entire\nimage. However, it is natural to view a camera image as heterogeneous, as the\ncolor intensity and the artificial noise are distributed vastly differently,\neven across the two-dimensional domain of a single image. Varied Moire ringing,\nmotion blur, color-bleaching, or lens-based projection distortions can all\npotentially lead to a heterogeneous image artifact filtering problem. In this\npaper, we present a specific patch-based, local subspace deep neural network\nthat improves Camera ISP to be robust to heterogeneous artifacts (especially\nimage denoising). We call our three-fold deep-trained model the Patch Subspace\nLearning Autoencoder (PSL-AE). The PSL-AE model does not make assumptions\nregarding uniform levels of image distortion. Instead, it first encodes patches\nextracted from noisy and clean image pairs, with different artifact types or\ndistortion levels, by contrastive learning. Then, the patches of each image are\nencoded into corresponding soft clusters within their suitable latent\nsub-space, utilizing a prior mixture model. Furthermore, the decoders undergo\ntraining in an unsupervised manner, specifically trained for the image patches\npresent in each cluster. The experiments highlight the adaptability and\nefficacy through enhanced heterogeneous filtering, on both synthesized\nartifacts and realistic SIDD image pairs.\n","authors":["Yunhao Yang","Yi Wang","Chandrajit Bajaj"],"pdf_url":"https://arxiv.org/pdf/2104.00253v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.16789v2","updated":"2023-10-03T14:45:48Z","published":"2023-07-31T15:56:53Z","title":"ToolLLM: Facilitating Large Language Models to Master 16000+ Real-world\n APIs","summary":" Despite the advancements of open-source large language models (LLMs), e.g.,\nLLaMA, they remain significantly limited in tool-use capabilities, i.e., using\nexternal tools (APIs) to fulfill human instructions. The reason is that current\ninstruction tuning largely focuses on basic language tasks but ignores the\ntool-use domain. This is in contrast to the excellent tool-use capabilities of\nstate-of-the-art (SOTA) closed-source LLMs, e.g., ChatGPT. To bridge this gap,\nwe introduce ToolLLM, a general tool-use framework encompassing data\nconstruction, model training, and evaluation. We first present ToolBench, an\ninstruction-tuning dataset for tool use, which is constructed automatically\nusing ChatGPT. 
Specifically, the construction can be divided into three stages:\n(i) API collection: we collect 16,464 real-world RESTful APIs spanning 49\ncategories from RapidAPI Hub; (ii) instruction generation: we prompt ChatGPT to\ngenerate diverse instructions involving these APIs, covering both single-tool\nand multi-tool scenarios; (iii) solution path annotation: we use ChatGPT to\nsearch for a valid solution path (chain of API calls) for each instruction. To\nenhance the reasoning capabilities of LLMs, we develop a novel depth-first\nsearch-based decision tree algorithm. It enables LLMs to evaluate multiple\nreasoning traces and expand the search space. Moreover, to evaluate the\ntool-use capabilities of LLMs, we develop an automatic evaluator: ToolEval.\nBased on ToolBench, we fine-tune LLaMA to obtain an LLM ToolLLaMA, and equip it\nwith a neural API retriever to recommend appropriate APIs for each instruction.\nExperiments show that ToolLLaMA demonstrates a remarkable ability to execute\ncomplex instructions and generalize to unseen APIs, and exhibits comparable\nperformance to ChatGPT. Our ToolLLaMA also demonstrates strong zero-shot\ngeneralization ability in an out-of-distribution tool-use dataset: APIBench.\n","authors":["Yujia Qin","Shihao Liang","Yining Ye","Kunlun Zhu","Lan Yan","Yaxi Lu","Yankai Lin","Xin Cong","Xiangru Tang","Bill Qian","Sihan Zhao","Lauren Hong","Runchu Tian","Ruobing Xie","Jie Zhou","Mark Gerstein","Dahai Li","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2307.16789v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02094v1","updated":"2023-10-03T14:38:12Z","published":"2023-10-03T14:38:12Z","title":"CoNO: Complex Neural Operator for Continuous Dynamical Systems","summary":" Neural operators extend data-driven models to map between\ninfinite-dimensional functional spaces. These models have successfully solved\ncontinuous dynamical systems represented by differential equations, viz weather\nforecasting, fluid flow, or solid mechanics. However, the existing operators\nstill rely on real space, thereby losing rich representations potentially\ncaptured in the complex space by functional transforms. In this paper, we\nintroduce a Complex Neural Operator (CoNO), that parameterizes the integral\nkernel in the complex fractional Fourier domain. Additionally, the model\nemploying a complex-valued neural network along with aliasing-free activation\nfunctions preserves the complex values and complex algebraic properties,\nthereby enabling improved representation, robustness to noise, and\ngeneralization. We show that the model effectively captures the underlying\npartial differential equation with a single complex fractional Fourier\ntransform. We perform an extensive empirical evaluation of CoNO on several\ndatasets and additional tasks such as zero-shot super-resolution, evaluation of\nout-of-distribution data, data efficiency, and robustness to noise. CoNO\nexhibits comparable or superior performance to all the state-of-the-art models\nin these tasks. 
Altogether, CoNO presents a robust and superior model for\nmodeling continuous dynamical systems, providing a fillip to scientific machine\nlearning.\n","authors":["Karn Tiwari","N M Anoop Krishnan","Prathosh A P"],"pdf_url":"https://arxiv.org/pdf/2310.02094v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02093v1","updated":"2023-10-03T14:36:05Z","published":"2023-10-03T14:36:05Z","title":"Stochastic Gradient Descent with Preconditioned Polyak Step-size","summary":" Stochastic Gradient Descent (SGD) is one of the many iterative optimization\nmethods that are widely used in solving machine learning problems. These\nmethods display valuable properties and attract researchers and industrial\nmachine learning engineers with their simplicity. However, one of the\nweaknesses of this type of methods is the necessity to tune learning rate\n(step-size) for every loss function and dataset combination to solve an\noptimization problem and get an efficient performance in a given time budget.\nStochastic Gradient Descent with Polyak Step-size (SPS) is a method that offers\nan update rule that alleviates the need of fine-tuning the learning rate of an\noptimizer. In this paper, we propose an extension of SPS that employs\npreconditioning techniques, such as Hutchinson's method, Adam, and AdaGrad, to\nimprove its performance on badly scaled and/or ill-conditioned datasets.\n","authors":["Farshed Abdukhakimov","Chulu Xiang","Dmitry Kamzolov","Martin Takáč"],"pdf_url":"https://arxiv.org/pdf/2310.02093v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02090v1","updated":"2023-10-03T14:33:34Z","published":"2023-10-03T14:33:34Z","title":"1D-CapsNet-LSTM: A Deep Learning-Based Model for Multi-Step Stock Index\n Forecasting","summary":" Multi-step forecasting of stock market index prices is a crucial task in the\nfinancial sector, playing a pivotal role in decision-making across various\nfinancial activities. However, forecasting results are often unsatisfactory\nowing to the stochastic and volatile nature of the data. Researchers have made\nvarious attempts, and this process is ongoing. Inspired by convolutional neural\nnetwork long short-term memory (CNN-LSTM) networks that utilize a 1D CNN for\nfeature extraction to boost model performance, this study explores the use of a\ncapsule network (CapsNet) as an advanced feature extractor in an LSTM-based\nforecasting model to enhance multi-step predictions. To this end, a novel\nneural architecture called 1D-CapsNet-LSTM was introduced, which combines a 1D\nCapsNet to extract high-level features from 1D sequential data and an LSTM\nlayer to capture the temporal dependencies between the previously extracted\nfeatures and uses a multi-input multi-output (MIMO) strategy to maintain the\nstochastic dependencies between the predicted values at different time steps.\nThe proposed model was evaluated based on several real-world stock market\nindices, including Standard & Poor's 500 (S&P 500), Dow Jones Industrial\nAverage (DJIA), Nasdaq Composite Index (IXIC), and New York Stock Exchange\n(NYSE), and was compared with baseline models such as LSTM, recurrent neural\nnetwork (RNN), and CNN-LSTM in terms of various evaluation metrics. 
The\ncomparison results suggest that the 1D-CapsNet-LSTM model outperforms the\nbaseline models and has immense potential for the effective handling of complex\nprediction tasks.\n","authors":["Cheng Zhang","Nilam Nur Amir Sjarif","Roslina Ibrahim"],"pdf_url":"https://arxiv.org/pdf/2310.02090v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.05628v2","updated":"2023-10-03T14:32:11Z","published":"2023-03-10T00:11:18Z","title":"On the Unlikelihood of D-Separation","summary":" Causal discovery aims to recover a causal graph from data generated by it;\nconstraint based methods do so by searching for a d-separating conditioning set\nof nodes in the graph via an oracle. In this paper, we provide analytic\nevidence that on large graphs, d-separation is a rare phenomenon, even when\nguaranteed to exist, unless the graph is extremely sparse. We then provide an\nanalytic average case analysis of the PC Algorithm for causal discovery, as\nwell as a variant of the SGS Algorithm we call UniformSGS. We consider a set\n$V=\\{v_1,\\ldots,v_n\\}$ of nodes, and generate a random DAG $G=(V,E)$ where\n$(v_a, v_b) \\in E$ with i.i.d. probability $p_1$ if $a < b$ and $0$ if $a > b$.\nWe provide upper bounds on the probability that a subset of $V-\\{x,y\\}$\nd-separates $x$ and $y$, conditional on $x$ and $y$ being d-separable; our\nupper bounds decay exponentially fast to $0$ as $|V| \\rightarrow \\infty$. For\nthe PC Algorithm, while it is known that its worst-case guarantees fail on\nnon-sparse graphs, we show that the same is true for the average case, and that\nthe sparsity requirement is quite demanding: for good performance, the density\nmust go to $0$ as $|V| \\rightarrow \\infty$ even in the average case. For\nUniformSGS, while it is known that the running time is exponential for existing\nedges, we show that in the average case, that is the expected running time for\nmost non-existing edges as well.\n","authors":["Itai Feigenbaum","Huan Wang","Shelby Heinecke","Juan Carlos Niebles","Weiran Yao","Caiming Xiong","Devansh Arpit"],"pdf_url":"https://arxiv.org/pdf/2303.05628v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.08250v3","updated":"2023-10-03T14:22:29Z","published":"2023-03-14T21:52:27Z","title":"Transforming Transformers for Resilient Lifelong Learning","summary":" Lifelong learning without catastrophic forgetting (i.e., resiliency) remains\nan open problem for deep neural networks. The prior art mostly focuses on\nconvolutional neural networks. With the increasing dominance of Transformers in\ndeep learning, it is a pressing need to study lifelong learning with\nTransformers. Due to the complexity of training Transformers in practice, for\nlifelong learning, a question naturally arises: Can Transformers be learned to\ngrow in a task aware way, that is to be dynamically transformed by introducing\nlightweight learnable plastic components to the architecture, while retaining\nthe parameter-heavy, but stable components at streaming tasks? To that end,\nmotivated by the lifelong learning capability maintained by the functionality\nof Hippocampi in human brain, we explore what would be, and how to implement,\nArtificial Hippocampi (ArtiHippo) in Transformers. We present a method to\nidentify, and learn to grow, ArtiHippo in Vision Transformers (ViTs) for\nresilient lifelong learning in four aspects: (i) Where to place ArtiHippo to\nenable plasticity while preserving the core function of ViTs at streaming\ntasks? 
(ii) How to represent and realize ArtiHippo to ensure expressivity and\nadaptivity for tackling tasks of different nature in lifelong learning? (iii)\nHow to learn to grow ArtiHippo to exploit task synergies (i.e., the learned\nknowledge) and overcome catastrophic forgetting? (iv) How to harness the best\nof our proposed ArtiHippo and prompting-based approaches? In experiments, we\ntest the proposed method on the challenging Visual Domain Decathlon (VDD)\nbenchmark and the 5-Dataset benchmark under the task-incremental lifelong\nlearning setting. It obtains consistently better performance than the prior art\nwith sensible ArtiHippo learned continually. To our knowledge, it is the first\nattempt of lifelong learning with ViTs on the challenging VDD benchmark.\n","authors":["Chinmay Savadikar","Michelle Dai","Tianfu Wu"],"pdf_url":"https://arxiv.org/pdf/2303.08250v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2008.12248v3","updated":"2023-10-03T14:17:38Z","published":"2020-08-17T16:08:55Z","title":"A Survey on Reinforcement Learning for Combinatorial Optimization","summary":" This paper gives a detailed review of reinforcement learning (RL) in\ncombinatorial optimization, introduces the history of combinatorial\noptimization starting in the 1950s, and compares it with the RL algorithms of\nrecent years. This paper explicitly looks at a famous combinatorial\nproblem-traveling salesperson problem (TSP). It compares the approach of modern\nRL algorithms for the TSP with an approach published in the 1970s. By comparing\nthe similarities and variances between these methodologies, the paper\ndemonstrates how RL algorithms are optimized due to the evolution of machine\nlearning techniques and computing power. The paper then briefly introduces the\ndeep learning approach to the TSP named deep RL, which is an extension of the\ntraditional mathematical framework. In deep RL, attention and feature encoding\nmechanisms are introduced to generate near-optimal solutions. The survey shows\nthat integrating the deep learning mechanism, such as attention with RL, can\neffectively approximate the TSP. The paper also argues that deep learning could\nbe a generic approach that can be integrated with any traditional RL algorithm\nto enhance the outcomes of the TSP.\n","authors":["Yunhao Yang","Andrew Whinston"],"pdf_url":"https://arxiv.org/pdf/2008.12248v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02075v1","updated":"2023-10-03T14:15:20Z","published":"2023-10-03T14:15:20Z","title":"Learning Quantum Processes with Quantum Statistical Queries","summary":" Learning complex quantum processes is a central challenge in many areas of\nquantum computing and quantum machine learning, with applications in quantum\nbenchmarking, cryptanalysis, and variational quantum algorithms. This paper\nintroduces the first learning framework for studying quantum process learning\nwithin the Quantum Statistical Query (QSQ) model, providing the first formal\ndefinition of statistical queries to quantum processes (QPSQs). The framework\nallows us to propose an efficient QPSQ learner for arbitrary quantum processes\naccompanied by a provable performance guarantee. We also provide numerical\nsimulations to demonstrate the efficacy of this algorithm. 
The practical\nrelevance of this framework is exemplified through application in\ncryptanalysis, highlighting vulnerabilities of Classical-Readout Quantum\nPhysical Unclonable Functions (CR-QPUFs), addressing an important open question\nin the field of quantum hardware security. This work marks a significant step\ntowards understanding the learnability of quantum processes and shedding light\non their security implications.\n","authors":["Chirag Wadhwa","Mina Doosti"],"pdf_url":"https://arxiv.org/pdf/2310.02075v1.pdf","comment":"30 pages, 3 figures"},{"id":"http://arxiv.org/abs/2310.02074v1","updated":"2023-10-03T14:15:06Z","published":"2023-10-03T14:15:06Z","title":"ACE: A fast, skillful learned global atmospheric model for climate\n prediction","summary":" Existing ML-based atmospheric models are not suitable for climate prediction,\nwhich requires long-term stability and physical consistency. We present ACE\n(AI2 Climate Emulator), a 200M-parameter, autoregressive machine learning\nemulator of an existing comprehensive 100-km resolution global atmospheric\nmodel. The formulation of ACE allows evaluation of physical laws such as the\nconservation of mass and moisture. The emulator is stable for 10 years, nearly\nconserves column moisture without explicit constraints and faithfully\nreproduces the reference model's climate, outperforming a challenging baseline\non over 80% of tracked variables. ACE requires nearly 100x less wall clock time\nand is 100x more energy efficient than the reference model using typically\navailable resources.\n","authors":["Oliver Watt-Meyer","Gideon Dresdner","Jeremy McGibbon","Spencer K. Clark","Brian Henn","James Duncan","Noah D. Brenowitz","Karthik Kashinath","Michael S. Pritchard","Boris Bonev","Matthew E. Peters","Christopher S. Bretherton"],"pdf_url":"https://arxiv.org/pdf/2310.02074v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16916v2","updated":"2023-10-03T14:10:02Z","published":"2023-09-29T01:07:38Z","title":"ONNXExplainer: an ONNX Based Generic Framework to Explain Neural\n Networks Using Shapley Values","summary":" Understanding why a neural network model makes certain decisions can be as\nimportant as the inference performance. Various methods have been proposed to\nhelp practitioners explain the prediction of a neural network model, of which\nShapley values are most popular. SHAP package is a leading implementation of\nShapley values to explain neural networks implemented in TensorFlow or PyTorch\nbut lacks cross-platform support, one-shot deployment and is highly\ninefficient. To address these problems, we present the ONNXExplainer, which is\na generic framework to explain neural networks using Shapley values in the ONNX\necosystem. In ONNXExplainer, we develop its own automatic differentiation and\noptimization approach, which not only enables One-Shot Deployment of neural\nnetworks inference and explanations, but also significantly improves the\nefficiency to compute explanation with less memory consumption. For fair\ncomparison purposes, we also implement the same optimization in TensorFlow and\nPyTorch and measure its performance against the current state of the art\nopen-source counterpart, SHAP. 
Extensive benchmarks demonstrate that the\nproposed optimization approach improves the explanation latency of VGG19,\nResNet50, DenseNet201, and EfficientNetB0 by as much as 500%.\n","authors":["Yong Zhao","Runxin He","Nicholas Kersting","Can Liu","Shubham Agrawal","Chiranjeet Chetia","Yu Gu"],"pdf_url":"https://arxiv.org/pdf/2309.16916v2.pdf","comment":"11 pages, 11 figures"},{"id":"http://arxiv.org/abs/2310.02066v1","updated":"2023-10-03T14:09:15Z","published":"2023-10-03T14:09:15Z","title":"De Novo Drug Design with Joint Transformers","summary":" De novo drug design requires simultaneously generating novel molecules\noutside of training data and predicting their target properties, making it a\nhard task for generative models. To address this, we propose Joint Transformer\nthat combines a Transformer decoder, a Transformer encoder, and a predictor in\na joint generative model with shared weights. We show that training the model\nwith a penalized log-likelihood objective results in state-of-the-art\nperformance in molecule generation, while decreasing the prediction error on\nnewly sampled molecules, as compared to a fine-tuned decoder-only Transformer,\nby 42%. Finally, we propose a probabilistic black-box optimization algorithm\nthat employs Joint Transformer to generate novel molecules with improved target\nproperties, as compared to the training data, outperforming other SMILES-based\noptimization methods in de novo drug design.\n","authors":["Adam Izdebski","Ewelina Weglarz-Tomczak","Ewa Szczurek","Jakub M. Tomczak"],"pdf_url":"https://arxiv.org/pdf/2310.02066v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02065v1","updated":"2023-10-03T14:08:26Z","published":"2023-10-03T14:08:26Z","title":"VENOM: A Vectorized N:M Format for Unleashing the Power of Sparse Tensor\n Cores","summary":" The increasing success and scaling of Deep Learning models demands higher\ncomputational efficiency and power. Sparsification can lead to both smaller\nmodels as well as higher compute efficiency, and accelerated hardware is\nbecoming available. However, exploiting it efficiently requires kernel\nimplementations, pruning algorithms, and storage formats, to utilize hardware\nsupport of specialized sparse vector units. An example of those are the\nNVIDIA's Sparse Tensor Cores (SPTCs), which promise a 2x speedup. However,\nSPTCs only support the 2:4 format, limiting achievable sparsity ratios to 50%.\nWe present the V:N:M format, which enables the execution of arbitrary N:M\nratios on SPTCs. To efficiently exploit the resulting format, we propose\nSpatha, a high-performance sparse-library for DL routines. We show that Spatha\nachieves up to 37x speedup over cuBLAS. We also demonstrate a second-order\npruning technique that enables sparsification to high sparsity ratios with\nV:N:M and little to no loss in accuracy in modern transformers.\n","authors":["Roberto L. Castro","Andrei Ivanov","Diego Andrade","Tal Ben-Nun","Basilio B. 
Fraguela","Torsten Hoefler"],"pdf_url":"https://arxiv.org/pdf/2310.02065v1.pdf","comment":"Accepted by 2023 International Conference on High Performance\n Computing, Networking, Storage and Analysis, 2023 (SC'23)"},{"id":"http://arxiv.org/abs/2310.02063v1","updated":"2023-10-03T14:04:45Z","published":"2023-10-03T14:04:45Z","title":"Lessons Learned from EXMOS User Studies: A Technical Report Summarizing\n Key Takeaways from User Studies Conducted to Evaluate The EXMOS Platform","summary":" In the realm of interactive machine-learning systems, the provision of\nexplanations serves as a vital aid in the processes of debugging and enhancing\nprediction models. However, the extent to which various global model-centric\nand data-centric explanations can effectively assist domain experts in\ndetecting and resolving potential data-related issues for the purpose of model\nimprovement has remained largely unexplored. In this technical report, we\nsummarise the key findings of our two user studies. Our research involved a\ncomprehensive examination of the impact of global explanations rooted in both\ndata-centric and model-centric perspectives within systems designed to support\nhealthcare experts in optimising machine learning models through both automated\nand manual data configurations. To empirically investigate these dynamics, we\nconducted two user studies, comprising quantitative analysis involving a sample\nsize of 70 healthcare experts and qualitative assessments involving 30\nhealthcare experts. These studies were aimed at illuminating the influence of\ndifferent explanation types on three key dimensions: trust, understandability,\nand model improvement. Results show that global model-centric explanations\nalone are insufficient for effectively guiding users during the intricate\nprocess of data configuration. In contrast, data-centric explanations exhibited\ntheir potential by enhancing the understanding of system changes that occur\npost-configuration. However, a combination of both showed the highest level of\nefficacy for fostering trust, improving understandability, and facilitating\nmodel enhancement among healthcare experts. We also present essential\nimplications for developing interactive machine-learning systems driven by\nexplanations. These insights can guide the creation of more effective systems\nthat empower domain experts to harness the full potential of machine learning\n","authors":["Aditya Bhattacharya","Simone Stumpf","Lucija Gosak","Gregor Stiglic","Katrien Verbert"],"pdf_url":"https://arxiv.org/pdf/2310.02063v1.pdf","comment":"It is a technical report only. The contents are not peer-reviewed.\n Please reach out to the main author for any questions"},{"id":"http://arxiv.org/abs/2201.12143v2","updated":"2023-10-03T13:58:09Z","published":"2022-01-28T14:29:25Z","title":"Locally Invariant Explanations: Towards Stable and Unidirectional\n Explanations through Local Invariant Learning","summary":" Locally interpretable model agnostic explanations (LIME) method is one of the\nmost popular methods used to explain black-box models at a per example level.\nAlthough many variants have been proposed, few provide a simple way to produce\nhigh fidelity explanations that are also stable and intuitive. 
In this work, we\nprovide a novel perspective by proposing a model agnostic local explanation\nmethod inspired by the invariant risk minimization (IRM) principle --\noriginally proposed for (global) out-of-distribution generalization -- to\nprovide such high fidelity explanations that are also stable and unidirectional\nacross nearby examples. Our method is based on a game theoretic formulation\nwhere we theoretically show that our approach has a strong tendency to\neliminate features where the gradient of the black-box function abruptly\nchanges sign in the locality of the example we want to explain, while in other\ncases it is more careful and will choose a more conservative (feature)\nattribution, a behavior which can be highly desirable for recourse.\nEmpirically, we show on tabular, image and text data that the quality of our\nexplanations with neighborhoods formed using random perturbations are much\nbetter than LIME and in some cases even comparable to other methods that use\nrealistic neighbors sampled from the data manifold. This is desirable given\nthat learning a manifold to either create realistic neighbors or to project\nexplanations is typically expensive or may even be impossible. Moreover, our\nalgorithm is simple and efficient to train, and can ascertain stable input\nfeatures for local decisions of a black-box without access to side information\nsuch as a (partial) causal graph as has been seen in some recent works.\n","authors":["Amit Dhurandhar","Karthikeyan Ramamurthy","Kartik Ahuja","Vijay Arya"],"pdf_url":"https://arxiv.org/pdf/2201.12143v2.pdf","comment":"Accepted to NeurIPS 2023"},{"id":"http://arxiv.org/abs/2303.13093v2","updated":"2023-10-03T13:56:51Z","published":"2023-03-23T08:17:10Z","title":"The Probabilistic Stability of Stochastic Gradient Descent","summary":" Characterizing and understanding the stability of Stochastic Gradient Descent\n(SGD) remains an open problem in deep learning. A common method is to utilize\nthe convergence of statistical moments, esp. the variance, of the parameters to\nquantify the stability. We revisit the definition of stability for SGD and\npropose using the $\\textit{convergence in probability}$ condition to define the\n$\\textit{probabilistic stability}$ of SGD. The probabilistic stability sheds\nlight on a fundamental question in deep learning theory: how SGD selects a\nmeaningful solution for a neural network from an enormous number of possible\nsolutions that may severely overfit. We show that only through the lens of\nprobabilistic stability does SGD exhibit rich and practically relevant phases\nof learning, such as the phases of the complete loss of stability, incorrect\nlearning where the model captures incorrect data correlation, convergence to\nlow-rank saddles, and correct learning where the model captures the correct\ncorrelation. These phase boundaries are precisely quantified by the Lyapunov\nexponents of the dynamics. 
The obtained phase diagrams imply that SGD prefers\nlow-rank saddles in a neural network when the underlying gradient is noisy,\nthereby influencing the learning performance.\n","authors":["Liu Ziyin","Botao Li","Tomer Galanti","Masahito Ueda"],"pdf_url":"https://arxiv.org/pdf/2303.13093v2.pdf","comment":"preprint with revision"},{"id":"http://arxiv.org/abs/2310.02041v1","updated":"2023-10-03T13:34:21Z","published":"2023-10-03T13:34:21Z","title":"The Inhibitor: ReLU and Addition-Based Attention for Efficient\n Transformers","summary":" To enhance the computational efficiency of quantized Transformers, we replace\nthe dot-product and Softmax-based attention with an alternative mechanism\ninvolving addition and ReLU activation only. This side-steps the expansion to\ndouble precision often required by matrix multiplication and avoids costly\nSoftmax evaluations but maintains much of the core functionality of\nconventional dot-product attention. It can enable more efficient execution and\nsupport larger quantized Transformer models on resource-constrained hardware or\nalternative arithmetic systems like homomorphic encryption. Training\nexperiments on four common benchmark tasks show test set prediction scores\ncomparable to those of conventional Transformers with dot-product attention.\nOur scaling experiments also suggest significant computational savings, both in\nplaintext and under encryption. In particular, we believe that the ReLU and\naddition-based attention mechanism introduced in this paper may enable\nprivacy-preserving AI applications operating under homomorphic encryption by\navoiding the costly multiplication of encrypted variables.\n","authors":["Rickard Brännvall"],"pdf_url":"https://arxiv.org/pdf/2310.02041v1.pdf","comment":"8 pages, 3 tables"},{"id":"http://arxiv.org/abs/2302.10130v2","updated":"2023-10-03T13:22:42Z","published":"2023-02-20T18:00:38Z","title":"Infinite-Dimensional Diffusion Models","summary":" Diffusion models have had a profound impact on many application areas,\nincluding those where data are intrinsically infinite-dimensional, such as\nimages or time series. The standard approach is first to discretize and then to\napply diffusion models to the discretized data. While such approaches are\npractically appealing, the performance of the resulting algorithms typically\ndeteriorates as discretization parameters are refined. In this paper, we\ninstead directly formulate diffusion-based generative models in infinite\ndimensions and apply them to the generative modeling of functions. We prove\nthat our formulations are well posed in the infinite-dimensional setting and\nprovide dimension-independent distance bounds from the sample to the target\nmeasure. Using our theory, we also develop guidelines for the design of\ninfinite-dimensional diffusion models. For image distributions, these\nguidelines are in line with the canonical choices currently made for diffusion\nmodels. 
For other distributions, however, we can improve upon these canonical\nchoices, which we show both theoretically and empirically, by applying the\nalgorithms to data distributions on manifolds and inspired by Bayesian inverse\nproblems or simulation-based inference.\n","authors":["Jakiw Pidstrigach","Youssef Marzouk","Sebastian Reich","Sven Wang"],"pdf_url":"https://arxiv.org/pdf/2302.10130v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02032v1","updated":"2023-10-03T13:17:38Z","published":"2023-10-03T13:17:38Z","title":"aSAGA: Automatic Sleep Analysis with Gray Areas","summary":" State-of-the-art automatic sleep staging methods have already demonstrated\ncomparable reliability and superior time efficiency to manual sleep staging.\nHowever, fully automatic black-box solutions are difficult to adapt into\nclinical workflow and the interaction between explainable automatic methods and\nthe work of sleep technologists remains underexplored and inadequately\nconceptualized. Thus, we propose a human-in-the-loop concept for sleep\nanalysis, presenting an automatic sleep staging model (aSAGA), that performs\neffectively with both clinical polysomnographic recordings and home sleep\nstudies. To validate the model, extensive testing was conducted, employing a\npreclinical validation approach with three retrospective datasets; open-access,\nclinical, and research-driven. Furthermore, we validate the utilization of\nuncertainty mapping to identify ambiguous regions, conceptualized as gray\nareas, in automatic sleep analysis that warrants manual re-evaluation. The\nresults demonstrate that the automatic sleep analysis achieved a comparable\nlevel of agreement with manual analysis across different sleep recording types.\nMoreover, validation of the gray area concept revealed its potential to enhance\nsleep staging accuracy and identify areas in the recordings where sleep\ntechnologists struggle to reach a consensus. In conclusion, this study\nintroduces and validates a concept from explainable artificial intelligence\ninto sleep medicine and provides the basis for integrating human-in-the-loop\nautomatic sleep staging into clinical workflows, aiming to reduce black-box\ncriticism and the burden associated with manual sleep staging.\n","authors":["Matias Rusanen","Gabriel Jouan","Riku Huttunen","Sami Nikkonen","Sigríður Sigurðardóttir","Juha Töyräs","Brett Duce","Sami Myllymaa","Erna Sif Arnardottir","Timo Leppänen","Anna Sigridur Islind","Samu Kainulainen","Henri Korkalainen"],"pdf_url":"https://arxiv.org/pdf/2310.02032v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02031v1","updated":"2023-10-03T13:17:35Z","published":"2023-10-03T13:17:35Z","title":"OceanGPT: A Large Language Model for Ocean Science Tasks","summary":" Ocean science, which delves into the oceans that are reservoirs of life and\nbiodiversity, is of great significance given that oceans cover over 70% of our\nplanet's surface. Recently, advances in Large Language Models (LLMs) have\ntransformed the paradigm in science. Despite the success in other domains,\ncurrent LLMs often fall short in catering to the needs of domain experts like\noceanographers, and the potential of LLMs for ocean science is under-explored.\nThe intrinsic reason may be the immense and intricate nature of ocean data as\nwell as the necessity for higher granularity and richness in knowledge. To\nalleviate these issues, we introduce OceanGPT, the first-ever LLM in the ocean\ndomain, which is expert in various ocean science tasks. 
We propose DoInstruct,\na novel framework to automatically obtain a large volume of ocean domain\ninstruction data, which generates instructions based on multi-agent\ncollaboration. Additionally, we construct the first oceanography benchmark,\nOceanBench, to evaluate the capabilities of LLMs in the ocean domain. Through\ncomprehensive experiments, OceanGPT not only shows a higher level of knowledge\nexpertise for ocean science tasks but also gains preliminary embodied\nintelligence capabilities in ocean technology. Codes, data and checkpoints will\nsoon be available at https://github.com/zjunlp/KnowLM.\n","authors":["Zhen Bi","Ningyu Zhang","Yida Xue","Yixin Ou","Guozhou Zheng","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2310.02031v1.pdf","comment":"Work in progress. Project Website:\n https://zjunlp.github.io/project/OceanGPT/"},{"id":"http://arxiv.org/abs/2310.02029v1","updated":"2023-10-03T13:15:02Z","published":"2023-10-03T13:15:02Z","title":"Between accurate prediction and poor decision making: the AI/ML gap","summary":" Intelligent agents rely on AI/ML functionalities to predict the consequence\nof possible actions and optimise the policy. However, the effort of the\nresearch community in addressing prediction accuracy has been so intense (and\nsuccessful) that it created the illusion that the more accurate the learner\nprediction (or classification) the better would have been the final decision.\nNow, such an assumption is valid only if the (human or artificial) decision\nmaker has complete knowledge of the utility of the possible actions. This paper\nargues that the AI/ML community has so far taken too unbalanced an approach by\ndevoting excessive attention to the estimation of the state (or target)\nprobability to the detriment of accurate and reliable estimations of the\nutility. In particular, little evidence exists about the impact of a wrong utility\nassessment on the resulting expected utility of the decision strategy. This\nsituation is creating a substantial gap between the expectations and the\neffective impact of AI solutions, as witnessed by recent criticisms and\nemphasised by the regulatory legislative efforts. This paper aims to study this\ngap by quantifying the sensitivity of the expected utility to the utility\nuncertainty and comparing it to the one due to probability estimation.\nTheoretical and simulated results show that an inaccurate utility assessment\nmay be as harmful as (and sometimes more harmful than) a poor probability estimation. The\nfinal recommendation to the community is then to undertake a focus shift from a\npure accuracy-driven (or obsessed) approach to a more utility-aware\nmethodology.\n","authors":["Gianluca Bontempi"],"pdf_url":"https://arxiv.org/pdf/2310.02029v1.pdf","comment":"Position paper presented in the BENELEARN 2022 conference"},{"id":"http://arxiv.org/abs/2310.02027v1","updated":"2023-10-03T13:10:14Z","published":"2023-10-03T13:10:14Z","title":"DeepHGCN: Toward Deeper Hyperbolic Graph Convolutional Networks","summary":" Hyperbolic graph convolutional networks (HGCN) have demonstrated significant\npotential in extracting information from hierarchical graphs. However, existing\nHGCNs are limited to shallow architectures, due to the expensive hyperbolic\noperations and the over-smoothing issue as depth increases. Although in GCNs,\ntreatments have been applied to alleviate over-smoothing, developing a\nhyperbolic therapy presents distinct challenges since operations should be\ncarefully designed to fit the hyperbolic nature. 
Addressing the above\nchallenges, in this work, we propose DeepHGCN, the first deep multi-layer HGCN\narchitecture with dramatically improved computational efficiency and\nsubstantially alleviated over-smoothing effect. DeepHGCN presents two key\nenablers of deep HGCNs: (1) a novel hyperbolic feature transformation layer\nthat enables fast and accurate linear maps; and (2) Techniques such as\nhyperbolic residual connections and regularization for both weights and\nfeatures facilitated by an efficient hyperbolic midpoint method. Extensive\nexperiments demonstrate that DeepHGCN obtains significant improvements in link\nprediction and node classification tasks compared to both Euclidean and shallow\nhyperbolic GCN variants.\n","authors":["Jiaxu Liu","Xinping Yi","Xiaowei Huang"],"pdf_url":"https://arxiv.org/pdf/2310.02027v1.pdf","comment":"12 pages including appendix and reference"},{"id":"http://arxiv.org/abs/2108.11299v5","updated":"2023-10-03T13:08:50Z","published":"2021-08-25T15:49:10Z","title":"Certifiers Make Neural Networks Vulnerable to Availability Attacks","summary":" To achieve reliable, robust, and safe AI systems, it is vital to implement\nfallback strategies when AI predictions cannot be trusted. Certifiers for\nneural networks are a reliable way to check the robustness of these\npredictions. They guarantee for some predictions that a certain class of\nmanipulations or attacks could not have changed the outcome. For the remaining\npredictions without guarantees, the method abstains from making a prediction,\nand a fallback strategy needs to be invoked, which typically incurs additional\ncosts, can require a human operator, or even fail to provide any prediction.\nWhile this is a key concept towards safe and secure AI, we show for the first\ntime that this approach comes with its own security risks, as such fallback\nstrategies can be deliberately triggered by an adversary. In addition to\nnaturally occurring abstains for some inputs and perturbations, the adversary\ncan use training-time attacks to deliberately trigger the fallback with high\nprobability. This transfers the main system load onto the fallback, reducing\nthe overall system's integrity and/or availability. We design two novel\navailability attacks, which show the practical relevance of these threats. For\nexample, adding 1% poisoned data during training is sufficient to trigger the\nfallback and hence make the model unavailable for up to 100% of all inputs by\ninserting the trigger. Our extensive experiments across multiple datasets,\nmodel architectures, and certifiers demonstrate the broad applicability of\nthese attacks. An initial investigation into potential defenses shows that\ncurrent approaches are insufficient to mitigate the issue, highlighting the\nneed for new, specific solutions.\n","authors":["Tobias Lorenz","Marta Kwiatkowska","Mario Fritz"],"pdf_url":"https://arxiv.org/pdf/2108.11299v5.pdf","comment":"Published at 16th ACM Workshop on Artificial Intelligence and\n Security (AISec '23)"},{"id":"http://arxiv.org/abs/2310.02025v1","updated":"2023-10-03T13:05:36Z","published":"2023-10-03T13:05:36Z","title":"DeepZero: Scaling up Zeroth-Order Optimization for Deep Model Training","summary":" Zeroth-order (ZO) optimization has become a popular technique for solving\nmachine learning (ML) problems when first-order (FO) information is difficult\nor impossible to obtain. 
However, the scalability of ZO optimization remains an\nopen problem: Its use has primarily been limited to relatively small-scale ML\nproblems, such as sample-wise adversarial attack generation. To our best\nknowledge, no prior work has demonstrated the effectiveness of ZO optimization\nin training deep neural networks (DNNs) without a significant decrease in\nperformance. To overcome this roadblock, we develop DeepZero, a principled ZO\ndeep learning (DL) framework that can scale ZO optimization to DNN training\nfrom scratch through three primary innovations. First, we demonstrate the\nadvantages of coordinate-wise gradient estimation (CGE) over randomized\nvector-wise gradient estimation in training accuracy and computational\nefficiency. Second, we propose a sparsity-induced ZO training protocol that\nextends the model pruning methodology using only finite differences to explore\nand exploit the sparse DL prior in CGE. Third, we develop the methods of\nfeature reuse and forward parallelization to advance the practical\nimplementations of ZO training. Our extensive experiments show that DeepZero\nachieves state-of-the-art (SOTA) accuracy on ResNet-20 trained on CIFAR-10,\napproaching FO training performance for the first time. Furthermore, we show\nthe practical utility of DeepZero in applications of certified adversarial\ndefense and DL-based partial differential equation error correction, achieving\n10-20% improvement over SOTA. We believe our results will inspire future\nresearch on scalable ZO optimization and contribute to advancing DL with black\nbox.\n","authors":["Aochuan Chen","Yimeng Zhang","Jinghan Jia","James Diffenderfer","Jiancheng Liu","Konstantinos Parasyris","Yihua Zhang","Zheng Zhang","Bhavya Kailkhura","Sijia Liu"],"pdf_url":"https://arxiv.org/pdf/2310.02025v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02023v1","updated":"2023-10-03T12:58:10Z","published":"2023-10-03T12:58:10Z","title":"Nash Regret Guarantees for Linear Bandits","summary":" We obtain essentially tight upper bounds for a strengthened notion of regret\nin the stochastic linear bandits framework. The strengthening -- referred to as\nNash regret -- is defined as the difference between the (a priori unknown)\noptimum and the geometric mean of expected rewards accumulated by the linear\nbandit algorithm. Since the geometric mean corresponds to the well-studied Nash\nsocial welfare (NSW) function, this formulation quantifies the performance of a\nbandit algorithm as the collective welfare it generates across rounds. NSW is\nknown to satisfy fairness axioms and, hence, an upper bound on Nash regret\nprovides a principled fairness guarantee.\n We consider the stochastic linear bandits problem over a horizon of $T$\nrounds and with set of arms ${X}$ in ambient dimension $d$. Furthermore, we\nfocus on settings in which the stochastic reward -- associated with each arm in\n${X}$ -- is a non-negative, $\\nu$-sub-Poisson random variable. For this\nsetting, we develop an algorithm that achieves a Nash regret of $O\\left(\n\\sqrt{\\frac{d\\nu}{T}} \\log( T |X|)\\right)$. In addition, addressing linear\nbandit instances in which the set of arms ${X}$ is not necessarily finite, we\nobtain a Nash regret upper bound of $O\\left(\n\\frac{d^\\frac{5}{4}\\nu^{\\frac{1}{2}}}{\\sqrt{T}} \\log(T)\\right)$. Since bounded\nrandom variables are sub-Poisson, these results hold for bounded, positive\nrewards. 
Our linear bandit algorithm is built upon the successive elimination\nmethod with novel technical insights, including tailored concentration bounds\nand the use of sampling via John ellipsoid in conjunction with the\nKiefer-Wolfowitz optimal design.\n","authors":["Ayush Sawarni","Soumybrata Pal","Siddharth Barman"],"pdf_url":"https://arxiv.org/pdf/2310.02023v1.pdf","comment":"35 pages"},{"id":"http://arxiv.org/abs/2305.18403v3","updated":"2023-10-03T12:51:55Z","published":"2023-05-28T15:15:48Z","title":"LoRAPrune: Pruning Meets Low-Rank Parameter-Efficient Fine-Tuning","summary":" Large pre-trained models (LPMs), such as LLaMA and GLM, have shown\nexceptional performance across various tasks through fine-tuning. Although\nlow-rank adaption (LoRA) has emerged to cheaply fine-tune these LPMs on\ndownstream tasks, their deployment is still hindered by the vast model scale\nand computational costs. Neural network pruning offers a way to compress LPMs.\nHowever, the current pruning methods designed for LPMs are not compatible with\nLoRA. This is due to their utilization of unstructured pruning on LPMs,\nimpeding the merging of LoRA weights, or their dependence on the gradients of\npre-trained weights to guide pruning, which can impose significant memory\noverhead. To this end, we propose LoRAPrune, a new framework that delivers an\naccurate, compact model for efficient inference in a highly memory-effective\nmanner. Specifically, we first design a LoRA-guided pruning criterion, which\nuses the weights and gradients of LoRA, rather than the gradients of\npre-trained weights for importance estimation. We then propose a structured\niterative pruning procedure, to remove redundant channels and heads. Extensive\nexperimental results demonstrate the superior performance of our LoRAPrune over\nexisting approaches on the LLaMA series models. For instance, at a 50\\%\ncompression rate, LoRAPrune outperforms LLM-Pruner by a perplexity reduction of\n8.0 on WikiText2 and 16.05 on PTB datasets, while concurrently reducing memory\nusage by 52.6\\%. The code will be released after review\n","authors":["Mingyang Zhang","Hao Chen","Chunhua Shen","Zhen Yang","Linlin Ou","Xinyi Yu","Bohan Zhuang"],"pdf_url":"https://arxiv.org/pdf/2305.18403v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02016v1","updated":"2023-10-03T12:42:13Z","published":"2023-10-03T12:42:13Z","title":"Ranking a Set of Objects using Heterogeneous Workers: QUITE an Easy\n Problem","summary":" We focus on the problem of ranking $N$ objects starting from a set of noisy\npairwise comparisons provided by a crowd of unequal workers, each worker being\ncharacterized by a specific degree of reliability, which reflects her ability\nto rank pairs of objects. More specifically, we assume that objects are endowed\nwith intrinsic qualities and that the probability with which an object is\npreferred to another depends both on the difference between the qualities of\nthe two competitors and on the reliability of the worker. We propose QUITE, a\nnon-adaptive ranking algorithm that jointly estimates workers' reliabilities\nand qualities of objects. Performance of QUITE is compared in different\nscenarios against previously proposed algorithms. 
Finally, we show how QUITE\ncan be naturally made adaptive.\n","authors":["Alessandro Nordio","Alberto Tarable","Emilio Leonardi"],"pdf_url":"https://arxiv.org/pdf/2310.02016v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02013v1","updated":"2023-10-03T12:37:15Z","published":"2023-10-03T12:37:15Z","title":"Spectral operator learning for parametric PDEs without data reliance","summary":" In this paper, we introduce the Spectral Coefficient Learning via Operator\nNetwork (SCLON), a novel operator learning-based approach for solving\nparametric partial differential equations (PDEs) without the need for data\nharnessing. The cornerstone of our method is the spectral methodology that\nemploys expansions using orthogonal functions, such as Fourier series and\nLegendre polynomials, enabling accurate PDE solutions with fewer grid points.\nBy merging the merits of spectral methods - encompassing high accuracy,\nefficiency, generalization, and the exact fulfillment of boundary conditions -\nwith the prowess of deep neural networks, SCLON offers a transformative\nstrategy. Our approach not only eliminates the need for paired input-output\ntraining data, which typically requires extensive numerical computations, but\nalso effectively learns and predicts solutions of complex parametric PDEs,\nranging from singularly perturbed convection-diffusion equations to the\nNavier-Stokes equations. The proposed framework demonstrates superior\nperformance compared to existing scientific machine learning techniques,\noffering solutions for multiple instances of parametric PDEs without harnessing\ndata. The mathematical framework is robust and reliable, with a well-developed\nloss function derived from the weak formulation, ensuring accurate\napproximation of solutions while exactly satisfying boundary conditions. The\nmethod's efficacy is further illustrated through its ability to accurately\npredict intricate natural behaviors like the Kolmogorov flow and boundary\nlayers. In essence, our work pioneers a compelling avenue for parametric PDE\nsolutions, serving as a bridge between traditional numerical methodologies and\ncutting-edge machine learning techniques in the realm of scientific\ncomputation.\n","authors":["Junho Choi","Taehyun Yun","Namjung Kim","Youngjoon Hong"],"pdf_url":"https://arxiv.org/pdf/2310.02013v1.pdf","comment":"28 pages, 8 figures"},{"id":"http://arxiv.org/abs/2310.02012v1","updated":"2023-10-03T12:35:02Z","published":"2023-10-03T12:35:02Z","title":"Towards Training Without Depth Limits: Batch Normalization Without\n Gradient Explosion","summary":" Normalization layers are one of the key building blocks for deep neural\nnetworks. Several theoretical studies have shown that batch normalization\nimproves the signal propagation, by avoiding the representations from becoming\ncollinear across the layers. However, results on mean-field theory of batch\nnormalization also conclude that this benefit comes at the expense of exploding\ngradients in depth. Motivated by these two aspects of batch normalization, in\nthis study we pose the following question: \"Can a batch-normalized network keep\nthe optimal signal propagation properties, but avoid exploding gradients?\" We\nanswer this question in the affirmative by giving a particular construction of\na Multi-Layer Perceptron (MLP) with linear activations and batch-normalization\nthat provably has bounded gradients at any depth. 
Based on Weingarten calculus,\nwe develop a rigorous and non-asymptotic theory for this constructed MLP that\ngives a precise characterization of forward signal propagation, while proving\nthat gradients remain bounded for linearly independent input samples, which\nholds in most practical settings. Inspired by our theory, we also design an\nactivation shaping scheme that empirically achieves the same properties for\ncertain non-linear activations.\n","authors":["Alexandru Meterez","Amir Joudaki","Francesco Orabona","Alexander Immer","Gunnar Rätsch","Hadi Daneshmand"],"pdf_url":"https://arxiv.org/pdf/2310.02012v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02011v1","updated":"2023-10-03T12:34:31Z","published":"2023-10-03T12:34:31Z","title":"Decoding Human Activities: Analyzing Wearable Accelerometer and\n Gyroscope Data for Activity Recognition","summary":" A person's movement or relative positioning effectively generates raw\nelectrical signals that can be read by computing machines to apply various\nmanipulative techniques for the classification of different human activities.\nIn this paper, a stratified multi-structural approach based on a Residual\nnetwork ensembled with Residual MobileNet is proposed, termed as FusionActNet.\nThe proposed method involves using carefully designed Residual blocks for\nclassifying the static and dynamic activities separately because they have\nclear and distinct characteristics that set them apart. These networks are\ntrained independently, resulting in two specialized and highly accurate models.\nThese models excel at recognizing activities within a specific superclass by\ntaking advantage of the unique algorithmic benefits of architectural\nadjustments. Afterward, these two ResNets are passed through a weighted\nensemble-based Residual MobileNet. Subsequently, this ensemble proficiently\ndiscriminates between a specific static and a specific dynamic activity, which\nwere previously identified based on their distinct feature characteristics in\nthe earlier stage. The proposed model is evaluated using two publicly\naccessible datasets; namely, UCI HAR and Motion-Sense. Therein, it successfully\nhandled the highly confusing cases of data overlap. Therefore, the proposed\napproach achieves a state-of-the-art accuracy of 96.71% and 95.35% in the UCI\nHAR and Motion-Sense datasets respectively.\n","authors":["Utsab Saha","Sawradip Saha","Tahmid Kabir","Shaikh Anowarul Fattah","Mohammad Saquib"],"pdf_url":"https://arxiv.org/pdf/2310.02011v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02008v1","updated":"2023-10-03T12:24:51Z","published":"2023-10-03T12:24:51Z","title":"fmeffects: An R Package for Forward Marginal Effects","summary":" Forward marginal effects (FMEs) have recently been introduced as a versatile\nand effective model-agnostic interpretation method. They provide comprehensible\nand actionable model explanations in the form of: If we change $x$ by an amount\n$h$, what is the change in predicted outcome $\\widehat{y}$? We present the R\npackage fmeffects, the first software implementation of FMEs. The relevant\ntheoretical background, package functionality and handling, as well as the\nsoftware design and options for future extensions are discussed in this paper.\n","authors":["Holger Löwe","Christian A. 
Scholbeck","Christian Heumann","Bernd Bischl","Giuseppe Casalicchio"],"pdf_url":"https://arxiv.org/pdf/2310.02008v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02000v1","updated":"2023-10-03T12:19:19Z","published":"2023-10-03T12:19:19Z","title":"MUSCLE: Multi-task Self-supervised Continual Learning to Pre-train Deep\n Models for X-ray Images of Multiple Body Parts","summary":" While self-supervised learning (SSL) algorithms have been widely used to\npre-train deep models, few efforts [11] have been made to improve\nrepresentation learning of X-ray image analysis with SSL pre-trained models. In\nthis work, we study a novel self-supervised pre-training pipeline, namely\nMulti-task Self-supervised Continual Learning (MUSCLE), for multiple medical\nimaging tasks, such as classification and segmentation, using X-ray images\ncollected from multiple body parts, including heads, lungs, and bones.\nSpecifically, MUSCLE aggregates X-rays collected from multiple body parts for\nMoCo-based representation learning, and adopts a well-designed continual\nlearning (CL) procedure to further pre-train the backbone subject to various X-ray\nanalysis tasks jointly. Certain strategies for image pre-processing, learning\nschedules, and regularization have been used to solve data heterogeneity,\noverfitting, and catastrophic forgetting problems for multi-task/dataset\nlearning in MUSCLE. We evaluate MUSCLE using 9 real-world X-ray datasets with\nvarious tasks, including pneumonia classification, skeletal abnormality\nclassification, lung segmentation, and tuberculosis (TB) detection. Comparisons\nagainst other pre-trained models [7] confirm the proof-of-concept that\nself-supervised multi-task/dataset continual pre-training could boost the\nperformance of X-ray image analysis.\n","authors":["Weibin Liao","Haoyi Xiong","Qingzhong Wang","Yan Mo","Xuhong Li","Yi Liu","Zeyu Chen","Siyu Huang","Dejing Dou"],"pdf_url":"https://arxiv.org/pdf/2310.02000v1.pdf","comment":"accepted by Medical Image Computing and Computer Assisted\n Intervention (MICCAI) 2022"},{"id":"http://arxiv.org/abs/2310.00604v2","updated":"2023-10-03T12:13:34Z","published":"2023-10-01T07:35:12Z","title":"Path Structured Multimarginal Schrödinger Bridge for Probabilistic\n Learning of Hardware Resource Usage by Control Software","summary":" The solution of the path structured multimarginal Schr\\\"{o}dinger bridge\nproblem (MSBP) is the most-likely measure-valued trajectory consistent with a\nsequence of observed probability measures or distributional snapshots. We\nleverage recent algorithmic advances in solving such structured MSBPs for\nlearning stochastic hardware resource usage by control software. The solution\nenables predicting the time-varying distribution of hardware resource\navailability at a desired time with guaranteed linear convergence. We\ndemonstrate the efficacy of our probabilistic learning approach in a model\npredictive control software execution case study. The method exhibits rapid\nconvergence to an accurate prediction of hardware resource utilization of the\ncontroller. The method can be broadly applied to any software to predict\ncyber-physical context-dependent performance at arbitrary time.\n","authors":["Georgiy A. Bondar","Robert Gifford","Linh Thi Xuan Phan","Abhishek Halder"],"pdf_url":"https://arxiv.org/pdf/2310.00604v2.pdf","comment":"8 pages, 6 figures. 
Submitted to American Control Conference (ACC)\n 2024"},{"id":"http://arxiv.org/abs/2309.12252v2","updated":"2023-10-03T12:10:58Z","published":"2023-09-21T16:52:34Z","title":"Parallelizing non-linear sequential models over the sequence length","summary":" Sequential models, such as Recurrent Neural Networks and Neural Ordinary\nDifferential Equations, have long suffered from slow training due to their\ninherent sequential nature. For many years this bottleneck has persisted, as\nmany thought sequential models could not be parallelized. We challenge this\nlong-held belief with our parallel algorithm that accelerates GPU evaluation of\nsequential models by up to 3 orders of magnitude faster without compromising\noutput accuracy. The algorithm does not need any special structure in the\nsequential models' architecture, making it applicable to a wide range of\narchitectures. Using our method, training sequential models can be more than 10\ntimes faster than the common sequential method without any meaningful\ndifference in the training results. Leveraging this accelerated training, we\ndiscovered the efficacy of the Gated Recurrent Unit in a long time series\nclassification problem with 17k time samples. By overcoming the training\nbottleneck, our work serves as the first step to unlock the potential of\nnon-linear sequential models for long sequence problems.\n","authors":["Yi Heng Lim","Qi Zhu","Joshua Selfridge","Muhammad Firmansyah Kasim"],"pdf_url":"https://arxiv.org/pdf/2309.12252v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01991v1","updated":"2023-10-03T12:03:06Z","published":"2023-10-03T12:03:06Z","title":"Fill in the Blank: Exploring and Enhancing LLM Capabilities for Backward\n Reasoning in Math Word Problems","summary":" While forward reasoning (i.e. find the answer given the question) has been\nexplored extensively in the recent literature, backward reasoning is relatively\nunexplored. We examine the backward reasoning capabilities of LLMs on Math Word\nProblems (MWPs): given a mathematical question and its answer, with some\ndetails omitted from the question, can LLMs effectively retrieve the missing\ninformation?\n In this paper, we formally define the backward reasoning task on math word\nproblems and modify three datasets to evaluate this task: GSM8k, SVAMP and\nMultiArith. Our findings show a significant drop in the accuracy of models on\nbackward reasoning compared to forward reasoning across four SOTA LLMs (GPT4,\nGPT3.5, PaLM-2, and LLaMa-2). Utilizing the specific format of this task, we\npropose three novel techniques that improve performance: Rephrase reformulates\nthe given problem into a forward reasoning problem, PAL-Tools combines the idea\nof Program-Aided LLMs to produce a set of equations that can be solved by an\nexternal solver, and Check your Work exploits the availability of natural\nverifier of high accuracy in the forward direction, interleaving solving and\nverification steps. Finally, realizing that each of our base methods correctly\nsolves a different set of problems, we propose a novel Bayesian formulation for\ncreating an ensemble over these base methods aided by a verifier to further\nboost the accuracy by a significant margin. 
Extensive experimentation\ndemonstrates that our techniques successively improve the performance of LLMs\non the backward reasoning task, with the final ensemble-based method resulting\nin a substantial performance gain compared to the raw LLMs with standard\nprompting techniques such as chain-of-thought.\n","authors":["Aniruddha Deb","Neeva Oza","Sarthak Singla","Dinesh Khandelwal","Dinesh Garg","Parag Singla"],"pdf_url":"https://arxiv.org/pdf/2310.01991v1.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2305.14122v2","updated":"2023-10-03T12:03:05Z","published":"2023-05-23T14:46:32Z","title":"Transferring Learning Trajectories of Neural Networks","summary":" Training deep neural networks (DNNs) is computationally expensive, which is\nproblematic especially when performing duplicated or similar training runs in\nmodel ensemble or fine-tuning pre-trained models, for example. Once we have\ntrained one DNN on some dataset, we have its learning trajectory (i.e., a\nsequence of intermediate parameters during training) which may potentially\ncontain useful information for learning the dataset. However, there has been no\nattempt to utilize such information of a given learning trajectory for another\ntraining. In this paper, we formulate the problem of \"transferring\" a given\nlearning trajectory from one initial parameter to another one (learning\ntransfer problem) and derive the first algorithm to approximately solve it by\nmatching gradients successively along the trajectory via permutation symmetry.\nWe empirically show that the transferred parameters achieve non-trivial\naccuracy before any direct training, and can be trained significantly faster\nthan training from scratch.\n","authors":["Daiki Chijiwa"],"pdf_url":"https://arxiv.org/pdf/2305.14122v2.pdf","comment":"v2: updates include theoretical analysis and additional experiments"},{"id":"http://arxiv.org/abs/2212.12921v3","updated":"2023-10-03T11:56:32Z","published":"2022-12-25T15:40:05Z","title":"Learning k-Level Sparse Neural Networks Using a New Generalized Weighted\n Group Sparse Envelope Regularization","summary":" We propose an efficient method to learn both unstructured and structured\nsparse neural networks during training, utilizing a novel generalization of the\nsparse envelope function (SEF) used as a regularizer, termed {\\itshape{weighted\ngroup sparse envelope function}} (WGSEF). The WGSEF acts as a neuron group\nselector, which is leveraged to induce structured sparsity. The method ensures\na hardware-friendly structured sparsity of a deep neural network (DNN) to\nefficiently accelerate the DNN's evaluation. Notably, the method is adaptable,\nletting any hardware specify group definitions, such as filters, channels,\nfilter shapes, layer depths, a single parameter (unstructured), etc. Owing to\nthe WGSEF's properties, the proposed method allows one to pre-define a sparsity\nlevel that will be achieved at training convergence, while maintaining\nnegligible network accuracy degradation or even improvement in the case of\nredundant parameters. We introduce an efficient technique to calculate the\nexact value of the WGSEF along with its proximal operator in a worst-case\ncomplexity of $O(n)$, where $n$ is the total number of group variables. In\naddition, we propose a proximal-gradient-based optimization method to train the\nmodel, that is, the non-convex minimization of the sum of the neural network\nloss and the WGSEF. 
Finally, we conduct an experiment and illustrate the\nefficiency of our proposed technique in terms of the completion ratio,\naccuracy, and inference latency.\n","authors":["Yehonathan Refael","Iftach Arbel","Wasim Huleihel"],"pdf_url":"https://arxiv.org/pdf/2212.12921v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01975v1","updated":"2023-10-03T11:31:37Z","published":"2023-10-03T11:31:37Z","title":"Benign Overfitting in Two-Layer ReLU Convolutional Neural Networks for\n XOR Data","summary":" Modern deep learning models are usually highly over-parameterized so that\nthey can overfit the training data. Surprisingly, such overfitting neural\nnetworks can usually still achieve high prediction accuracy. To study this\n\"benign overfitting\" phenomenon, a line of recent works has theoretically\nstudied the learning of linear models and two-layer neural networks. However,\nmost of these analyses are still limited to the very simple learning problems\nwhere the Bayes-optimal classifier is linear. In this work, we investigate a\nclass of XOR-type classification tasks with label-flipping noises. We show\nthat, under a certain condition on the sample complexity and signal-to-noise\nratio, an over-parameterized ReLU CNN trained by gradient descent can achieve\nnear Bayes-optimal accuracy. Moreover, we also establish a matching lower bound\nresult showing that when the previous condition is not satisfied, the\nprediction accuracy of the obtained CNN is an absolute constant away from the\nBayes-optimal rate. Our result demonstrates that CNNs have a remarkable\ncapacity to efficiently learn XOR problems, even in the presence of highly\ncorrelated features.\n","authors":["Xuran Meng","Difan Zou","Yuan Cao"],"pdf_url":"https://arxiv.org/pdf/2310.01975v1.pdf","comment":"74 pages, 3 figures"},{"id":"http://arxiv.org/abs/2310.01973v1","updated":"2023-10-03T11:30:50Z","published":"2023-10-03T11:30:50Z","title":"Federated Wasserstein Distance","summary":" We introduce a principled way of computing the Wasserstein distance between\ntwo distributions in a federated manner. Namely, we show how to estimate the\nWasserstein distance between two samples stored and kept on different\ndevices/clients whilst a central entity/server orchestrates the computations\n(again, without having access to the samples). To achieve this feat, we take\nadvantage of the geometric properties of the Wasserstein distance -- in\nparticular, the triangle inequality -- and that of the associated {\\em\ngeodesics}: our algorithm, FedWad (for Federated Wasserstein Distance),\niteratively approximates the Wasserstein distance by manipulating and\nexchanging distributions from the space of geodesics in lieu of the input\nsamples. 
In addition to establishing the convergence properties of FedWad, we\nprovide empirical results on federated coresets and federated optimal transport\ndataset distance, which we respectively exploit for building a novel federated\nmodel and for boosting performance of popular federated learning algorithms.\n","authors":["Alain Rakotomamonjy","Kimia Nadjahi","Liva Ralaivola"],"pdf_url":"https://arxiv.org/pdf/2310.01973v1.pdf","comment":"23 pages"},{"id":"http://arxiv.org/abs/2310.01972v1","updated":"2023-10-03T11:28:54Z","published":"2023-10-03T11:28:54Z","title":"Epidemic Learning: Boosting Decentralized Learning with Randomized\n Communication","summary":" We present Epidemic Learning (EL), a simple yet powerful decentralized\nlearning (DL) algorithm that leverages changing communication topologies to\nachieve faster model convergence compared to conventional DL approaches. At\neach round of EL, each node sends its model updates to a random sample of $s$\nother nodes (in a system of $n$ nodes). We provide an extensive theoretical\nanalysis of EL, demonstrating that its changing topology culminates in superior\nconvergence properties compared to the state-of-the-art (static and dynamic)\ntopologies. Considering smooth non-convex loss functions, the number of\ntransient iterations for EL, i.e., the rounds required to achieve asymptotic\nlinear speedup, is in $\\mathcal{O}(\\frac{n^3}{s^2})$ which outperforms the\nbest-known bound $\\mathcal{O}({n^3})$ by a factor of $ s^2 $, indicating the\nbenefit of randomized communication for DL. We empirically evaluate EL in a\n96-node network and compare its performance with state-of-the-art DL\napproaches. Our results illustrate that EL converges up to $ 1.6\\times $\nquicker than baseline DL algorithms and attains 1.8% higher accuracy for the\nsame communication volume.\n","authors":["Martijn de Vos","Sadegh Farhadkhani","Rachid Guerraoui","Anne-Marie Kermarrec","Rafael Pires","Rishi Sharma"],"pdf_url":"https://arxiv.org/pdf/2310.01972v1.pdf","comment":"Accepted paper at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2310.01959v1","updated":"2023-10-03T11:10:21Z","published":"2023-10-03T11:10:21Z","title":"Beyond Labeling Oracles: What does it mean to steal ML models?","summary":" Model extraction attacks are designed to steal trained models with only query\naccess, as is often provided through APIs that ML-as-a-Service providers offer.\nML models are expensive to train, in part because data is hard to obtain, and a\nprimary incentive for model extraction is to acquire a model while incurring\nless cost than training from scratch. Literature on model extraction commonly\nclaims or presumes that the attacker is able to save on both data acquisition\nand labeling costs. We show that the attacker often does not. This is because\ncurrent attacks implicitly rely on the adversary being able to sample from the\nvictim model's data distribution. We thoroughly evaluate factors influencing\nthe success of model extraction. We discover that prior knowledge of the\nattacker, i.e. access to in-distribution data, dominates other factors like the\nattack policy the adversary follows to choose which queries to make to the\nvictim model API. Thus, an adversary looking to develop an equally capable\nmodel with a fixed budget has little practical incentive to perform model\nextraction, since for the attack to work they need to collect in-distribution\ndata, saving only on the cost of labeling. 
With low labeling costs in the\ncurrent market, the usefulness of such attacks is questionable. Ultimately, we\ndemonstrate that the effect of prior knowledge needs to be explicitly decoupled\nfrom the attack policy. To this end, we propose a benchmark to evaluate attack\npolicy directly.\n","authors":["Avital Shafran","Ilia Shumailov","Murat A. Erdogdu","Nicolas Papernot"],"pdf_url":"https://arxiv.org/pdf/2310.01959v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.01145v3","updated":"2023-10-03T11:10:08Z","published":"2022-07-04T00:09:33Z","title":"Memory Population in Continual Learning via Outlier Elimination","summary":" Catastrophic forgetting, the phenomenon of forgetting previously learned\ntasks when learning a new one, is a major hurdle in developing continual\nlearning algorithms. A popular method to alleviate forgetting is to use a\nmemory buffer, which stores a subset of previously learned task examples for\nuse during training on new tasks. The de facto method of filling memory is by\nrandomly selecting previous examples. However, this process could introduce\noutliers or noisy samples that could hurt the generalization of the model. This\npaper introduces Memory Outlier Elimination (MOE), a method for identifying and\neliminating outliers in the memory buffer by choosing samples from\nlabel-homogeneous subpopulations. We show that a space with a high homogeneity\nis related to a feature space that is more representative of the class\ndistribution. In practice, MOE removes a sample if it is surrounded by samples\nfrom different labels. We demonstrate the effectiveness of MOE on CIFAR-10,\nCIFAR-100, and CORe50, outperforming previous well-known memory population\nmethods.\n","authors":["Julio Hurtado","Alain Raymond-Saez","Vladimir Araujo","Vincenzo Lomonaco","Alvaro Soto","Davide Bacciu"],"pdf_url":"https://arxiv.org/pdf/2207.01145v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08822v2","updated":"2023-10-03T11:02:21Z","published":"2023-07-17T20:31:41Z","title":"A Meta-Learning Based Precoder Optimization Framework for Rate-Splitting\n Multiple Access","summary":" In this letter, we propose the use of a meta-learning based precoder\noptimization framework to directly optimize the Rate-Splitting Multiple Access\n(RSMA) precoders with partial Channel State Information at the Transmitter\n(CSIT). By exploiting the overfitting of the compact neural network to maximize\nthe explicit Average Sum-Rate (ASR) expression, we effectively bypass the need\nfor any other training data while minimizing the total running time. Numerical\nresults reveal that the meta-learning based solution achieves similar ASR\nperformance to conventional precoder optimization in medium-scale scenarios,\nand significantly outperforms sub-optimal low complexity precoder algorithms in\nthe large-scale regime.\n","authors":["Rafael Cerna Loli","Bruno Clerckx"],"pdf_url":"https://arxiv.org/pdf/2307.08822v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.10616v4","updated":"2023-10-03T11:01:55Z","published":"2023-05-18T00:04:38Z","title":"Evaluation Metrics for DNNs Compression","summary":" There is a lot of ongoing research effort into developing different\ntechniques for neural networks compression. However, the community lacks\nstandardised evaluation metrics, which are key to identifying the most suitable\ncompression technique for different applications. 
This paper reviews existing\nneural network compression evaluation metrics and implements them into a\nstandardisation framework called NetZIP. We introduce two novel metrics to\ncover existing gaps of evaluation in the literature: 1) Compression and\nHardware Agnostic Theoretical Speed (CHATS) and 2) Overall Compression Success\n(OCS). We demonstrate the use of NetZIP using two case studies on two different\nhardware platforms (a PC and a Raspberry Pi 4) focusing on object\nclassification and object detection.\n","authors":["Abanoub Ghobrial","Samuel Budgett","Dieter Balemans","Hamid Asgari","Phil Reiter","Kerstin Eder"],"pdf_url":"https://arxiv.org/pdf/2305.10616v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01951v1","updated":"2023-10-03T10:52:21Z","published":"2023-10-03T10:52:21Z","title":"Probabilistic Reach-Avoid for Bayesian Neural Networks","summary":" Model-based reinforcement learning seeks to simultaneously learn the dynamics\nof an unknown stochastic environment and synthesise an optimal policy for\nacting in it. Ensuring the safety and robustness of sequential decisions made\nthrough a policy in such an environment is a key challenge for policies\nintended for safety-critical scenarios. In this work, we investigate two\ncomplementary problems: first, computing reach-avoid probabilities for\niterative predictions made with dynamical models, with dynamics described by\nBayesian neural network (BNN); second, synthesising control policies that are\noptimal with respect to a given reach-avoid specification (reaching a \"target\"\nstate, while avoiding a set of \"unsafe\" states) and a learned BNN model. Our\nsolution leverages interval propagation and backward recursion techniques to\ncompute lower bounds for the probability that a policy's sequence of actions\nleads to satisfying the reach-avoid specification. Such computed lower bounds\nprovide safety certification for the given policy and BNN model. We then\nintroduce control synthesis algorithms to derive policies maximizing said lower\nbounds on the safety probability. We demonstrate the effectiveness of our\nmethod on a series of control benchmarks characterized by learned BNN dynamics\nmodels. On our most challenging benchmark, compared to purely data-driven\npolicies the optimal synthesis algorithm is able to provide more than a\nfour-fold increase in the number of certifiable states and more than a\nthree-fold increase in the average guaranteed reach-avoid probability.\n","authors":["Matthew Wicker","Luca Laurenti","Andrea Patane","Nicola Paoletti","Alessandro Abate","Marta Kwiatkowska"],"pdf_url":"https://arxiv.org/pdf/2310.01951v1.pdf","comment":"47 pages, 10 figures. arXiv admin note: text overlap with\n arXiv:2105.10134"},{"id":"http://arxiv.org/abs/2310.01942v1","updated":"2023-10-03T10:38:39Z","published":"2023-10-03T10:38:39Z","title":"OOD Aware Supervised Contrastive Learning","summary":" Out-of-Distribution (OOD) detection is a crucial problem for the safe\ndeployment of machine learning models identifying samples that fall outside of\nthe training distribution, i.e. in-distribution data (ID). Most OOD works focus\non the classification models trained with Cross Entropy (CE) and attempt to fix\nits inherent issues. In this work we leverage powerful representation learned\nwith Supervised Contrastive (SupCon) training and propose a holistic approach\nto learn a classifier robust to OOD data. We extend SupCon loss with two\nadditional contrast terms. 
The first term pushes auxiliary OOD representations\naway from ID representations without imposing any constraints on similarities\namong auxiliary data. The second term pushes OOD features far from the existing\nclass prototypes, while pushing ID representations closer to their\ncorresponding class prototype. When auxiliary OOD data is not available, we\npropose feature mixing techniques to efficiently generate pseudo-OOD features.\nOur solution is simple and efficient and acts as a natural extension of the\nclosed-set supervised contrastive representation learning. We compare against\ndifferent OOD detection methods on the common benchmarks and show\nstate-of-the-art results.\n","authors":["Soroush Seifi","Daniel Olmeda Reino","Nikolay Chumerin","Rahaf Aljundi"],"pdf_url":"https://arxiv.org/pdf/2310.01942v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.11328v4","updated":"2023-10-03T10:35:05Z","published":"2023-04-22T06:06:28Z","title":"On Accelerating Diffusion-Based Sampling Process via Improved\n Integration Approximation","summary":" A popular approach to sample a diffusion-based generative model is to solve\nan ordinary differential equation (ODE). In existing samplers, the coefficients\nof the ODE solvers are pre-determined by the ODE formulation, the reverse\ndiscrete timesteps, and the employed ODE methods. In this paper, we consider\naccelerating several popular ODE-based sampling processes (including EDM, DDIM,\nand DPM-Solver) by optimizing certain coefficients via improved integration\napproximation (IIA). We propose to minimize, for each time step, a mean squared\nerror (MSE) function with respect to the selected coefficients. The MSE is\nconstructed by applying the original ODE solver for a set of fine-grained\ntimesteps, which in principle provides a more accurate integration\napproximation in predicting the next diffusion state. The proposed IIA\ntechnique does not require any change of a pre-trained model, and only\nintroduces a very small computational overhead for solving a number of\nquadratic optimization problems. Extensive experiments show that considerably\nbetter FID scores can be achieved by using IIA-EDM, IIA-DDIM, and\nIIA-DPM-Solver than the original counterparts when the neural function\nevaluation (NFE) is small (i.e., less than 25).\n","authors":["Guoqiang Zhang","Niwa Kenta","W. Bastiaan Kleijn"],"pdf_url":"https://arxiv.org/pdf/2304.11328v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01937v1","updated":"2023-10-03T10:24:44Z","published":"2023-10-03T10:24:44Z","title":"Causal Inference with Conditional Front-Door Adjustment and Identifiable\n Variational Autoencoder","summary":" An essential and challenging problem in causal inference is causal effect\nestimation from observational data. The problem becomes more difficult with the\npresence of unobserved confounding variables. The front-door adjustment is a\npractical approach for dealing with unobserved confounding variables. However,\nthe restriction for the standard front-door adjustment is difficult to satisfy\nin practice. In this paper, we relax some of the restrictions by proposing the\nconcept of conditional front-door (CFD) adjustment and develop the theorem that\nguarantees the causal effect identifiability of CFD adjustment. Furthermore, as\nit is often impossible for a CFD variable to be given in practice, it is\ndesirable to learn it from data. 
By leveraging the ability of deep generative\nmodels, we propose CFDiVAE to learn the representation of the CFD adjustment\nvariable directly from data with the identifiable Variational AutoEncoder and\nformally prove the model identifiability. Extensive experiments on synthetic\ndatasets validate the effectiveness of CFDiVAE and its superiority over\nexisting methods. The experiments also show that the performance of CFDiVAE is\nless sensitive to the causal strength of unobserved confounding variables. We\nfurther apply CFDiVAE to a real-world dataset to demonstrate its potential\napplication.\n","authors":["Ziqi Xu","Debo Cheng","Jiuyong Li","Jixue Liu","Lin Liu","Kui Yu"],"pdf_url":"https://arxiv.org/pdf/2310.01937v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01929v1","updated":"2023-10-03T10:13:36Z","published":"2023-10-03T10:13:36Z","title":"Navigating Cultural Chasms: Exploring and Unlocking the Cultural POV of\n Text-To-Image Models","summary":" Text-To-Image (TTI) models, exemplified by DALL-E and StableDiffusion, have\nrecently gained prominence for their remarkable zero-shot capabilities in\ngenerating images guided by textual prompts. Language, as a conduit of culture,\nplays a pivotal role in these models' multilingual capabilities, which in turn\nshape their cultural agency. In this study, we explore the cultural perception\nembedded in TTI models by characterizing culture across three hierarchical\ntiers: cultural dimensions, cultural domains, and cultural concepts. We propose\na comprehensive suite of evaluation techniques, including intrinsic evaluations\nusing the CLIP space, extrinsic evaluations with a Visual-Question-Answer (VQA)\nmodel, and human assessments, to discern TTI cultural perceptions. To\nfacilitate our research, we introduce the CulText2I dataset, derived from four\ndiverse TTI models and spanning ten languages. Our experiments reveal insights\ninto these models' cultural awareness, cultural distinctions, and the unlocking\nof cultural features, releasing the potential for cross-cultural applications.\n","authors":["Mor Ventura","Eyal Ben-David","Anna Korhonen","Roi Reichart"],"pdf_url":"https://arxiv.org/pdf/2310.01929v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01924v1","updated":"2023-10-03T09:59:59Z","published":"2023-10-03T09:59:59Z","title":"RoFormer for Position Aware Multiple Instance Learning in Whole Slide\n Image Classification","summary":" Whole slide image (WSI) classification is a critical task in computational\npathology. However, the gigapixel-size of such images remains a major challenge\nfor the current state of deep-learning. Current methods rely on\nmultiple-instance learning (MIL) models with frozen feature extractors. Given\nthe high number of instances in each image, MIL methods have long assumed\nindependence and permutation-invariance of patches, disregarding the tissue\nstructure and correlation between patches. Recent works started studying this\ncorrelation between instances but the computational workload of such a high\nnumber of tokens remained a limiting factor. In particular, the relative position\nof patches remains unaddressed. We propose to apply a straightforward encoding\nmodule, namely a RoFormer layer, relying on memory-efficient exact\nself-attention and relative positional encoding. This module can perform full\nself-attention with relative position encoding on patches of large and\narbitrary shaped WSIs, solving the need for correlation between instances and\nspatial modeling of tissues. 
We demonstrate that our method outperforms\nstate-of-the-art MIL models on three commonly used public datasets (TCGA-NSCLC,\nBRACS and Camelyon16) on weakly supervised classification tasks. Code is\navailable at https://github.com/Sanofi-Public/DDS-RoFormerMIL\n","authors":["Etienne Pochet","Rami Maroun","Roger Trullo"],"pdf_url":"https://arxiv.org/pdf/2310.01924v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01181v2","updated":"2023-10-03T09:42:23Z","published":"2023-10-02T13:19:35Z","title":"Graph Isomorphic Networks for Assessing Reliability of the\n Medium-Voltage Grid","summary":" Ensuring electricity grid reliability becomes increasingly challenging with\nthe shift towards renewable energy and declining conventional capacities.\nDistribution System Operators (DSOs) aim to achieve grid reliability by\nverifying the n-1 principle, ensuring continuous operation in case of component\nfailure. Electricity networks' complex graph-based data holds crucial\ninformation for n-1 assessment: graph structure and data about stations/cables.\nUnlike traditional machine learning methods, Graph Neural Networks (GNNs)\ndirectly handle graph-structured data. This paper proposes using Graph\nIsomorphic Networks (GINs) for n-1 assessments in medium voltage grids. The GIN\nframework is designed to generalise to unseen grids and utilise graph structure\nand data about stations/cables. The proposed GIN approach demonstrates faster\nand more reliable grid assessments than a traditional mathematical optimisation\napproach, reducing prediction times by approximately a factor of 1000. The\nfindings offer a promising approach to address computational challenges and\nenhance the reliability and efficiency of energy grid assessments.\n","authors":["Charlotte Cambier van Nooten","Tom van de Poll","Sonja Füllhase","Jacco Heres","Tom Heskes","Yuliya Shapovalova"],"pdf_url":"https://arxiv.org/pdf/2310.01181v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2305.13057v3","updated":"2023-10-03T09:41:29Z","published":"2023-05-22T14:14:43Z","title":"Causality-Aided Trade-off Analysis for Machine Learning Fairness","summary":" There has been an increasing interest in enhancing the fairness of machine\nlearning (ML). Despite the growing number of fairness-improving methods, we\nlack a systematic understanding of the trade-offs among factors considered in\nthe ML pipeline when fairness-improving methods are applied. This understanding\nis essential for developers to make informed decisions regarding the provision\nof fair ML services. Nonetheless, it is extremely difficult to analyze the\ntrade-offs when there are multiple fairness parameters and other crucial\nmetrics involved, coupled, and even in conflict with one another.\n This paper uses causality analysis as a principled method for analyzing\ntrade-offs between fairness parameters and other crucial metrics in ML\npipelines. To practically and effectively conduct causality analysis, we propose\na set of domain-specific optimizations to facilitate accurate causal discovery\nand a unified, novel interface for trade-off analysis based on well-established\ncausal inference methods. We conduct a comprehensive empirical study using\nthree real-world datasets on a collection of widely used fairness-improving\ntechniques. Our study obtains actionable suggestions for users and developers\nof fair ML. 
We further demonstrate the versatile usage of our approach in\nselecting the optimal fairness-improving method, paving the way for more\nethical and socially responsible AI technologies.\n","authors":["Zhenlan Ji","Pingchuan Ma","Shuai Wang","Yanhui Li"],"pdf_url":"https://arxiv.org/pdf/2305.13057v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01912v1","updated":"2023-10-03T09:35:38Z","published":"2023-10-03T09:35:38Z","title":"Improved Automatic Diabetic Retinopathy Severity Classification Using\n Deep Multimodal Fusion of UWF-CFP and OCTA Images","summary":" Diabetic Retinopathy (DR), a prevalent and severe complication of diabetes,\naffects millions of individuals globally, underscoring the need for accurate\nand timely diagnosis. Recent advancements in imaging technologies, such as\nUltra-WideField Color Fundus Photography (UWF-CFP) imaging and Optical\nCoherence Tomography Angiography (OCTA), provide opportunities for the early\ndetection of DR but also pose significant challenges given the disparate nature\nof the data they produce. This study introduces a novel multimodal approach\nthat leverages these imaging modalities to notably enhance DR classification.\nOur approach integrates 2D UWF-CFP images and 3D high-resolution 6x6 mm$^3$\nOCTA (both structure and flow) images using a fusion of ResNet50 and\n3D-ResNet50 models, with Squeeze-and-Excitation (SE) blocks to amplify relevant\nfeatures. Additionally, to increase the model's generalization capabilities, a\nmultimodal extension of Manifold Mixup, applied to concatenated multimodal\nfeatures, is implemented. Experimental results demonstrate a remarkable\nenhancement in DR classification performance with the proposed multimodal\napproach compared to methods relying on a single modality only. The methodology\nlaid out in this work holds substantial promise for facilitating more accurate,\nearly detection of DR, potentially improving clinical outcomes for patients.\n","authors":["Mostafa El Habib Daho","Yihao Li","Rachid Zeghlache","Yapo Cedric Atse","Hugo Le Boité","Sophie Bonnin","Deborah Cosette","Pierre Deman","Laurent Borderie","Capucine Lepicard","Ramin Tadayoni","Béatrice Cochener","Pierre-Henri Conze","Mathieu Lamard","Gwenolé Quellec"],"pdf_url":"https://arxiv.org/pdf/2310.01912v1.pdf","comment":"Accepted preprint for presentation at MICCAI-OMIA 20023, Vancouver,\n Canada"},{"id":"http://arxiv.org/abs/2306.13575v3","updated":"2023-10-03T09:35:23Z","published":"2023-06-23T15:55:44Z","title":"Scaling MLPs: A Tale of Inductive Bias","summary":" In this work we revisit the most fundamental building block in deep learning,\nthe multi-layer perceptron (MLP), and study the limits of its performance on\nvision tasks. Empirical insights into MLPs are important for multiple reasons.\n(1) Given the recent narrative \"less inductive bias is better\", popularized due\nto transformers eclipsing convolutional models, it is natural to explore the\nlimits of this hypothesis. To that end, MLPs offer an ideal test bed, as they\nlack any vision-specific inductive bias. (2) MLPs have almost exclusively been\nthe main protagonist in the deep learning theory literature due to their\nmathematical simplicity, serving as a proxy to explain empirical phenomena\nobserved for more complex architectures. Surprisingly, experimental datapoints\nfor MLPs are very difficult to find in the literature, especially when coupled\nwith large pre-training protocols. 
This discrepancy between practice and theory\nis worrying: Do MLPs reflect the empirical advances exhibited by practical\nmodels? Or do theorists need to rethink the role of MLPs as a proxy? We provide\ninsights into both these aspects. We show that the performance of MLPs\ndrastically improves with scale (95% on CIFAR10, 82% on CIFAR100, 58% on\nImageNet ReaL), highlighting that lack of inductive bias can indeed be\ncompensated. We observe that MLPs mimic the behaviour of their modern\ncounterparts faithfully, with some components in the learning setting however\nexhibiting stronger or unexpected behaviours. Due to their inherent\ncomputational efficiency, large pre-training experiments become more accessible\nfor academic researchers. All of our experiments were run on a single GPU.\n","authors":["Gregor Bachmann","Sotiris Anagnostidis","Thomas Hofmann"],"pdf_url":"https://arxiv.org/pdf/2306.13575v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01904v1","updated":"2023-10-03T09:22:06Z","published":"2023-10-03T09:22:06Z","title":"Beyond the Benchmark: Detecting Diverse Anomalies in Videos","summary":" Video Anomaly Detection (VAD) plays a crucial role in modern surveillance\nsystems, aiming to identify various anomalies in real-world situations.\nHowever, current benchmark datasets predominantly emphasize simple,\nsingle-frame anomalies such as novel object detection. This narrow focus\nrestricts the advancement of VAD models. In this research, we advocate for an\nexpansion of VAD investigations to encompass intricate anomalies that extend\nbeyond conventional benchmark boundaries. To facilitate this, we introduce two\ndatasets, HMDB-AD and HMDB-Violence, to challenge models with diverse\naction-based anomalies. These datasets are derived from the HMDB51 action\nrecognition dataset. We further present Multi-Frame Anomaly Detection (MFAD), a\nnovel method built upon the AI-VAD framework. AI-VAD utilizes single-frame\nfeatures such as pose estimation and deep image encoding, and two-frame\nfeatures such as object velocity. They then apply a density estimation\nalgorithm to compute anomaly scores. To address complex multi-frame anomalies,\nwe add deep video encoding features capturing long-range temporal\ndependencies, and logistic regression to enhance final score calculation.\nExperimental results confirm our assumptions, highlighting existing models'\nlimitations with new anomaly types. MFAD excels in both simple and complex\nanomaly detection scenarios.\n","authors":["Yoav Arad","Michael Werman"],"pdf_url":"https://arxiv.org/pdf/2310.01904v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.11298v2","updated":"2023-10-03T08:55:59Z","published":"2022-11-21T09:41:18Z","title":"Exploring Physical Latent Spaces for High-Resolution Flow Restoration","summary":" We explore training deep neural network models in conjunction with physics\nsimulations via partial differential equations (PDEs), using the simulated\ndegrees of freedom as latent space for a neural network. In contrast to\nprevious work, this paper treats the degrees of freedom of the simulated space\npurely as tools to be used by the neural network. We demonstrate this concept\nfor learning reduced representations, as it is extremely challenging to\nfaithfully preserve correct solutions over long time-spans with traditional\nreduced representations, particularly for solutions with large amounts of small\nscale features.
This work focuses on the use of such physical, reduced latent\nspace for the restoration of fine simulations, by training models that can\nmodify the content of the reduced physical states as much as needed to best\nsatisfy the learning objective. This autonomy allows the neural networks to\ndiscover alternate dynamics that significantly improve the performance in the\ngiven tasks. We demonstrate this concept for various fluid flows ranging from\ndifferent turbulence scenarios to rising smoke plumes.\n","authors":["Chloe Paliard","Nils Thuerey","Kiwon Um"],"pdf_url":"https://arxiv.org/pdf/2211.11298v2.pdf","comment":"20 pages, 18 figures"},{"id":"http://arxiv.org/abs/2310.01892v1","updated":"2023-10-03T08:54:06Z","published":"2023-10-03T08:54:06Z","title":"FiGURe: Simple and Efficient Unsupervised Node Representations with\n Filter Augmentations","summary":" Unsupervised node representations learnt using contrastive learning-based\nmethods have shown good performance on downstream tasks. However, these methods\nrely on augmentations that mimic low-pass filters, limiting their performance\non tasks requiring different eigen-spectrum parts. This paper presents a simple\nfilter-based augmentation method to capture different parts of the\neigen-spectrum. We show significant improvements using these augmentations.\nFurther, we show that sharing the same weights across these different filter\naugmentations is possible, reducing the computational load. In addition,\nprevious works have shown that good performance on downstream tasks requires\nhigh dimensional representations. Working with high dimensions increases the\ncomputations, especially when multiple augmentations are involved. We mitigate\nthis problem and recover good performance through lower dimensional embeddings\nusing simple random Fourier feature projections. Our method, FiGURe achieves an\naverage gain of up to 4.4\\%, compared to the state-of-the-art unsupervised\nmodels, across all datasets in consideration, both homophilic and heterophilic.\nOur code can be found at: https://github.com/microsoft/figure.\n","authors":["Chanakya Ekbote","Ajinkya Pankaj Deshpande","Arun Iyer","Ramakrishna Bairi","Sundararajan Sellamanickam"],"pdf_url":"https://arxiv.org/pdf/2310.01892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.15289v2","updated":"2023-10-03T08:52:42Z","published":"2023-09-26T21:56:03Z","title":"SEPT: Towards Efficient Scene Representation Learning for Motion\n Prediction","summary":" Motion prediction is crucial for autonomous vehicles to operate safely in\ncomplex traffic environments. Extracting effective spatiotemporal relationships\namong traffic elements is key to accurate forecasting. Inspired by the\nsuccessful practice of pretrained large language models, this paper presents\nSEPT, a modeling framework that leverages self-supervised learning to develop\npowerful spatiotemporal understanding for complex traffic scenes. Specifically,\nour approach involves three masking-reconstruction modeling tasks on scene\ninputs including agents' trajectories and road network, pretraining the scene\nencoder to capture kinematics within trajectory, spatial structure of road\nnetwork, and interactions among roads and agents. The pretrained encoder is\nthen finetuned on the downstream forecasting task. 
Extensive experiments\ndemonstrate that SEPT, without elaborate architectural design or manual feature\nengineering, achieves state-of-the-art performance on the Argoverse 1 and\nArgoverse 2 motion forecasting benchmarks, outperforming previous methods on\nall main metrics by a large margin.\n","authors":["Zhiqian Lan","Yuxuan Jiang","Yao Mu","Chen Chen","Shengbo Eben Li","Hang Zhao","Keqiang Li"],"pdf_url":"https://arxiv.org/pdf/2309.15289v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01886v1","updated":"2023-10-03T08:39:33Z","published":"2023-10-03T08:39:33Z","title":"Effective and Parameter-Efficient Reusing Fine-Tuned Models","summary":" Many pre-trained large-scale models provided online have become highly\neffective in transferring to downstream tasks. At the same time, various\ntask-specific models fine-tuned on these pre-trained models are available\nonline for public use. In practice, as collecting task-specific data is\nlabor-intensive and fine-tuning the large pre-trained models is computationally\nexpensive, one can reuse task-specific finetuned models to deal with downstream\ntasks. However, using a model per task causes a heavy burden on storage and\nserving. Recently, many training-free and parameter-efficient methods have been\nproposed for reusing multiple fine-tuned task-specific models into a single\nmulti-task model. However, these methods exhibit a large accuracy gap compared\nwith using a fine-tuned model per task. In this paper, we propose\nParameter-Efficient methods for ReUsing (PERU) fine-tuned models. For reusing\nFully Fine-Tuned (FFT) models, we propose PERU-FFT by injecting a sparse task\nvector into a merged model by magnitude pruning. For reusing LoRA fine-tuned\nmodels, we propose PERU-LoRA, which uses a lower-rank matrix to approximate the LoRA\nmatrix by singular value decomposition. Both PERU-FFT and PERU-LoRA are\ntraining-free. Extensive experiments conducted on computer vision and natural\nlanguage processing tasks demonstrate the effectiveness and parameter-efficiency\nof the proposed methods. The proposed PERU-FFT and PERU-LoRA outperform\nexisting reusing model methods by a large margin and achieve comparable\nperformance to using a fine-tuned model per task.\n","authors":["Weisen Jiang","Baijiong Lin","Han Shi","Yu Zhang","Zhenguo Li","James T. Kwok"],"pdf_url":"https://arxiv.org/pdf/2310.01886v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2310.01405v2","updated":"2023-10-03T08:39:09Z","published":"2023-10-02T17:59:07Z","title":"Representation Engineering: A Top-Down Approach to AI Transparency","summary":" In this paper, we identify and characterize the emerging area of\nrepresentation engineering (RepE), an approach to enhancing the transparency of\nAI systems that draws on insights from cognitive neuroscience. RepE places\npopulation-level representations, rather than neurons or circuits, at the\ncenter of analysis, equipping us with novel methods for monitoring and\nmanipulating high-level cognitive phenomena in deep neural networks (DNNs). We\nprovide baselines and an initial analysis of RepE techniques, showing that they\noffer simple yet effective solutions for improving our understanding and\ncontrol of large language models. We showcase how these methods can provide\ntraction on a wide range of safety-relevant problems, including honesty,\nharmlessness, power-seeking, and more, demonstrating the promise of top-down\ntransparency research.
We hope that this work catalyzes further exploration of\nRepE and fosters advancements in the transparency and safety of AI systems.\n","authors":["Andy Zou","Long Phan","Sarah Chen","James Campbell","Phillip Guo","Richard Ren","Alexander Pan","Xuwang Yin","Mantas Mazeika","Ann-Kathrin Dombrowski","Shashwat Goel","Nathaniel Li","Michael J. Byun","Zifan Wang","Alex Mallen","Steven Basart","Sanmi Koyejo","Dawn Song","Matt Fredrikson","J. Zico Kolter","Dan Hendrycks"],"pdf_url":"https://arxiv.org/pdf/2310.01405v2.pdf","comment":"Code is available at\n https://github.com/andyzoujm/representation-engineering"},{"id":"http://arxiv.org/abs/2310.01885v1","updated":"2023-10-03T08:38:52Z","published":"2023-10-03T08:38:52Z","title":"Synthetic CT Generation via Variant Invertible Network for All-digital\n Brain PET Attenuation Correction","summary":" Attenuation correction (AC) is essential for the generation of artifact-free\nand quantitatively accurate positron emission tomography (PET) images. However,\nAC of PET faces challenges including inter-scan motion and erroneous\ntransformation of structural voxel-intensities to PET attenuation-correction\nfactors. Nowadays, the problem of AC for quantitative PET have been solved to a\nlarge extent after the commercial availability of devices combining PET with\ncomputed tomography (CT). Meanwhile, considering the feasibility of a deep\nlearning approach for PET AC without anatomical imaging, this paper develops a\nPET AC method, which uses deep learning to generate continuously valued CT\nimages from non-attenuation corrected PET images for AC on brain PET imaging.\nSpecifically, an invertible network combined with the variable augmentation\nstrategy that can achieve the bidirectional inference processes is proposed for\nsynthetic CT generation (IVNAC). To evaluate the performance of the proposed\nalgorithm, we conducted a comprehensive study on a total of 1440 data from 37\nclinical patients using comparative algorithms (such as Cycle-GAN and Pix2pix).\nPerceptual analysis and quantitative evaluations illustrate that the invertible\nnetwork for PET AC outperforms other existing AC models, which demonstrates the\npotential of the proposed method and the feasibility of achieving brain PET AC\nwithout CT.\n","authors":["Yu Guan","Bohui Shen","Xinchong Shi","Xiangsong Zhang","Bingxuan Li","Qiegen Liu"],"pdf_url":"https://arxiv.org/pdf/2310.01885v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01884v1","updated":"2023-10-03T08:37:21Z","published":"2023-10-03T08:37:21Z","title":"Adaptive Hybrid Model for Enhanced Stock Market Predictions Using\n Improved VMD and Stacked Informer","summary":" This paper introduces an innovative adaptive hybrid model for stock market\npredictions, leveraging the capabilities of an enhanced Variational Mode\nDecomposition (VMD), Feature Engineering (FE), and stacked Informer integrated\nwith an adaptive loss function. Through rigorous experimentation, the proposed\nmodel, termed Adam+GC+enhanced informer (We name it VMGCformer), demonstrates\nsignificant proficiency in addressing the intricate dynamics and volatile\nnature of stock market data. Experimental results, derived from multiple\nbenchmark datasets, underscore the model's superiority in terms of prediction\naccuracy, responsiveness, and generalization capabilities over traditional and\nother hybrid models. 
The research further highlights potential avenues for\noptimization and introduces future directions to enhance predictive modeling,\nespecially for small enterprises and feature engineering.\n","authors":["Jianan Zhang","Hongyi Duan"],"pdf_url":"https://arxiv.org/pdf/2310.01884v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01880v1","updated":"2023-10-03T08:34:44Z","published":"2023-10-03T08:34:44Z","title":"AutoCast++: Enhancing World Event Prediction with Zero-shot\n Ranking-based Context Retrieval","summary":" Machine-based prediction of real-world events is garnering attention due to\nits potential for informed decision-making. Whereas traditional forecasting\npredominantly hinges on structured data like time-series, recent breakthroughs\nin language models enable predictions using unstructured text. In particular,\n(Zou et al., 2022) unveils AutoCast, a new benchmark that employs news articles\nfor answering forecasting queries. Nevertheless, existing methods still trail\nbehind human performance. The cornerstone of accurate forecasting, we argue,\nlies in identifying a concise, yet rich subset of news snippets from a vast\ncorpus. With this motivation, we introduce AutoCast++, a zero-shot\nranking-based context retrieval system, tailored to sift through expansive news\ndocument collections for event forecasting. Our approach first re-ranks\narticles based on zero-shot question-passage relevance, honing in on\nsemantically pertinent news. Following this, the chosen articles are subjected\nto zero-shot summarization to attain succinct context. Leveraging a pre-trained\nlanguage model, we conduct both the relevance evaluation and article\nsummarization without needing domain-specific training. Notably, recent\narticles can sometimes be at odds with preceding ones due to new facts or\nunanticipated incidents, leading to fluctuating temporal dynamics. To tackle\nthis, our re-ranking mechanism gives preference to more recent articles, and we\nfurther regularize the multi-passage representation learning to align with\nhuman forecaster responses made on different dates. Empirical results\nunderscore marked improvements across multiple metrics, improving the\nperformance for multiple-choice questions (MCQ) by 48% and true/false (TF)\nquestions by up to 8%.\n","authors":["Qi Yan","Raihan Seraj","Jiawei He","Lili Meng","Tristan Sylvain"],"pdf_url":"https://arxiv.org/pdf/2310.01880v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01875v1","updated":"2023-10-03T08:25:32Z","published":"2023-10-03T08:25:32Z","title":"Towards Stable Backdoor Purification through Feature Shift Tuning","summary":" It has been widely observed that deep neural networks (DNN) are vulnerable to\nbackdoor attacks where attackers could manipulate the model behavior\nmaliciously by tampering with a small set of training samples. Although a line\nof defense methods is proposed to mitigate this threat, they either require\ncomplicated modifications to the training process or heavily rely on the\nspecific model architecture, which makes them hard to deploy into real-world\napplications. Therefore, in this paper, we instead start with fine-tuning, one\nof the most common and easy-to-deploy backdoor defenses, through comprehensive\nevaluations against diverse attack scenarios. Observations made through initial\nexperiments show that in contrast to the promising defensive results on high\npoisoning rates, vanilla tuning methods completely fail at low poisoning rate\nscenarios. 
Our analysis shows that with the low poisoning rate, the\nentanglement between backdoor and clean features undermines the effect of\ntuning-based defenses. Therefore, it is necessary to disentangle the backdoor\nand clean features in order to improve backdoor purification. To address this,\nwe introduce Feature Shift Tuning (FST), a method for tuning-based backdoor\npurification. Specifically, FST encourages feature shifts by actively deviating\nthe classifier weights from the originally compromised weights. Extensive\nexperiments demonstrate that our FST provides consistently stable performance\nunder different attack settings. Additionally, it is also convenient to deploy\nin real-world scenarios with significantly reduced computation costs. Our codes\nare available at\n\\url{https://github.com/AISafety-HKUST/stable_backdoor_purification}.\n","authors":["Rui Min","Zeyu Qin","Li Shen","Minhao Cheng"],"pdf_url":"https://arxiv.org/pdf/2310.01875v1.pdf","comment":"NeurIPS 2023 paper. The first two authors contributed equally"},{"id":"http://arxiv.org/abs/2106.02626v4","updated":"2023-10-03T08:25:16Z","published":"2021-06-04T17:39:36Z","title":"Dynamics of specialization in neural modules under resource constraints","summary":" It has long been believed that the brain is highly modular both in terms of\nstructure and function, although recent evidence has led some to question the\nextent of both types of modularity. We used artificial neural networks to test\nthe hypothesis that structural modularity is sufficient to guarantee functional\nspecialization, and find that in general, this doesn't necessarily hold except\nat extreme levels. We then systematically tested which features of the\nenvironment and network do lead to the emergence of specialization. We used a\nsimple toy environment, task and network, allowing us precise control, and show\nthat in this setup, several distinct measures of specialization give\nqualitatively similar results. We further find that (1) specialization can only\nemerge in environments where features of that environment are meaningfully\nseparable, (2) specialization preferentially emerges when the network is\nstrongly resource-constrained, and (3) these findings are qualitatively similar\nacross different network architectures, but the quantitative relationships\ndepends on the architecture type. Finally, we show that functional\nspecialization varies dynamically across time, and demonstrate that these\ndynamics depend on both the timing and bandwidth of information flow in the\nnetwork. We conclude that a static notion of specialization, based on\nstructural modularity, is likely too simple a framework for understanding\nintelligence in situations of real-world complexity, from biology to\nbrain-inspired neuromorphic systems. We propose that thoroughly stress testing\ncandidate definitions of functional modularity in simplified scenarios before\nextending to more complex data, network models and electrophysiological\nrecordings is likely to be a fruitful approach.\n","authors":["Gabriel Béna","Dan F. M. Goodman"],"pdf_url":"https://arxiv.org/pdf/2106.02626v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12375v3","updated":"2023-10-03T08:19:44Z","published":"2023-07-23T16:54:41Z","title":"In-Context Learning Learns Label Relationships but Is Not Conventional\n Learning","summary":" The predictions of Large Language Models (LLMs) on downstream tasks often\nimprove significantly when including examples of the input--label relationship\nin the context. 
However, there is currently no consensus about how this\nin-context learning (ICL) ability of LLMs works. For example, while Xie et al.\n(2021) liken ICL to a general-purpose learning algorithm, Min et al. (2022)\nargue ICL does not even learn label relationships from in-context examples. In\nthis paper, we provide novel insights into how ICL leverages label information,\nrevealing both capabilities and limitations. To ensure we obtain a\ncomprehensive picture of ICL behavior, we study probabilistic aspects of ICL\npredictions and thoroughly examine the dynamics of ICL as more examples are\nprovided. Our experiments show that ICL predictions almost always depend on\nin-context labels, and that ICL can learn truly novel tasks in-context.\nHowever, we also find that ICL struggles to fully overcome prediction\npreferences acquired from pre-training data, and, further, that ICL does not\nconsider all in-context information equally.\n","authors":["Jannik Kossen","Yarin Gal","Tom Rainforth"],"pdf_url":"https://arxiv.org/pdf/2307.12375v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01870v1","updated":"2023-10-03T08:15:20Z","published":"2023-10-03T08:15:20Z","title":"DeepDecipher: Accessing and Investigating Neuron Activation in Large\n Language Models","summary":" As large language models (LLMs) become more capable, there is an urgent need\nfor interpretable and transparent tools. Current methods are difficult to\nimplement, and accessible tools to analyze model internals are lacking. To\nbridge this gap, we present DeepDecipher - an API and interface for probing\nneurons in transformer models' MLP layers. DeepDecipher makes the outputs of\nadvanced interpretability techniques for LLMs readily available. The\neasy-to-use interface also makes inspecting these complex models more\nintuitive. This paper outlines DeepDecipher's design and capabilities. We\ndemonstrate how to analyze neurons, compare models, and gain insights into\nmodel behavior. For example, we contrast DeepDecipher's functionality with\nsimilar tools like Neuroscope and OpenAI's Neuron Explainer. DeepDecipher\nenables efficient, scalable analysis of LLMs. By granting access to\nstate-of-the-art interpretability methods, DeepDecipher makes LLMs more\ntransparent, trustworthy, and safe. Researchers, engineers, and developers can\nquickly diagnose issues, audit systems, and advance the field.\n","authors":["Albert Garde","Esben Kran","Fazl Barez"],"pdf_url":"https://arxiv.org/pdf/2310.01870v1.pdf","comment":"4 pages (9 total), 1 figure, submitted to NeurIPS 2023 Workshop XAIA"},{"id":"http://arxiv.org/abs/2310.01865v1","updated":"2023-10-03T08:08:09Z","published":"2023-10-03T08:08:09Z","title":"Conditional Instrumental Variable Regression with Representation\n Learning for Causal Inference","summary":" This paper studies the challenging problem of estimating causal effects from\nobservational data, in the presence of unobserved confounders. The two-stage\nleast square (TSLS) method and its variants with a standard instrumental\nvariable (IV) are commonly used to eliminate confounding bias, including the\nbias caused by unobserved confounders, but they rely on the linearity\nassumption. Besides, the strict condition of unconfounded instruments posed on\na standard IV is too strong to be practical. 
To address these challenging and\npractical problems of the standard IV method (linearity assumption and the\nstrict condition), in this paper, we use a conditional IV (CIV) to relax the\nunconfounded instrument condition of standard IV and propose a non-linear CIV\nregression with Confounding Balancing Representation Learning, CBRL.CIV, for\njointly eliminating the confounding bias from unobserved confounders and\nbalancing the observed confounders, without the linearity assumption. We\ntheoretically demonstrate the soundness of CBRL.CIV. Extensive experiments on\nsynthetic and two real-world datasets show the competitive performance of\nCBRL.CIV against state-of-the-art IV-based estimators and superiority in\ndealing with the non-linear situation.\n","authors":["Debo Cheng","Ziqi Xu","Jiuyong Li","Lin Liu","Jixue Liu","Thuc Duy Le"],"pdf_url":"https://arxiv.org/pdf/2310.01865v1.pdf","comment":"17pages, 3 figures and 6 tables"},{"id":"http://arxiv.org/abs/2310.01860v1","updated":"2023-10-03T07:49:17Z","published":"2023-10-03T07:49:17Z","title":"High-Probability Convergence for Composite and Distributed Stochastic\n Minimization and Variational Inequalities with Heavy-Tailed Noise","summary":" High-probability analysis of stochastic first-order optimization methods\nunder mild assumptions on the noise has been gaining a lot of attention in\nrecent years. Typically, gradient clipping is one of the key algorithmic\ningredients to derive good high-probability guarantees when the noise is\nheavy-tailed. However, if implemented na\\\"ively, clipping can spoil the\nconvergence of the popular methods for composite and distributed optimization\n(Prox-SGD/Parallel SGD) even in the absence of any noise. Due to this reason,\nmany works on high-probability analysis consider only unconstrained\nnon-distributed problems, and the existing results for composite/distributed\nproblems do not include some important special cases (like strongly convex\nproblems) and are not optimal. To address this issue, we propose new stochastic\nmethods for composite and distributed optimization based on the clipping of\nstochastic gradient differences and prove tight high-probability convergence\nresults (including nearly optimal ones) for the new methods. Using similar\nideas, we also develop new methods for composite and distributed variational\ninequalities and analyze the high-probability convergence of these methods.\n","authors":["Eduard Gorbunov","Abdurakhmon Sadiev","Marina Danilova","Samuel Horváth","Gauthier Gidel","Pavel Dvurechensky","Alexander Gasnikov","Peter Richtárik"],"pdf_url":"https://arxiv.org/pdf/2310.01860v1.pdf","comment":"143 pages"},{"id":"http://arxiv.org/abs/2310.01859v1","updated":"2023-10-03T07:48:11Z","published":"2023-10-03T07:48:11Z","title":"Variational Gaussian approximation of the Kushner optimal filter","summary":" In estimation theory, the Kushner equation provides the evolution of the\nprobability density of the state of a dynamical system given continuous-time\nobservations. Building upon our recent work, we propose a new way to\napproximate the solution of the Kushner equation through tractable variational\nGaussian approximations of two proximal losses associated with the propagation\nand Bayesian update of the probability density. The first is a proximal loss\nbased on the Wasserstein metric and the second is a proximal loss based on the\nFisher metric. The solution to this last proximal loss is given by implicit\nupdates on the mean and covariance that we proposed earlier. 
These two\nvariational updates can be fused and shown to satisfy a set of stochastic\ndifferential equations on the Gaussian's mean and covariance matrix. This\nGaussian flow is consistent with the Kalman-Bucy and Riccati flows in the\nlinear case and generalize them in the nonlinear one.\n","authors":["Marc Lambert","Silvère Bonnabel","Francis Bach"],"pdf_url":"https://arxiv.org/pdf/2310.01859v1.pdf","comment":"Lecture Notes in Computer Science, 2023"},{"id":"http://arxiv.org/abs/2306.05272v3","updated":"2023-10-03T07:37:54Z","published":"2023-06-08T15:20:27Z","title":"Image Clustering via the Principle of Rate Reduction in the Age of\n Pretrained Models","summary":" The advent of large pre-trained models has brought about a paradigm shift in\nboth visual representation learning and natural language processing. However,\nclustering unlabeled images, as a fundamental and classic machine learning\nproblem, still lacks an effective solution, particularly for large-scale\ndatasets. In this paper, we propose a novel image clustering pipeline that\nleverages the powerful feature representation of large pre-trained models such\nas CLIP and cluster images effectively and efficiently at scale. We first\ndeveloped a novel algorithm to estimate the number of clusters in a given\ndataset. We then show that the pre-trained features are significantly more\nstructured by further optimizing the rate reduction objective. The resulting\nfeatures may significantly improve the clustering accuracy, e.g., from 57\\% to\n66\\% on ImageNet-1k. Furthermore, by leveraging CLIP's multimodality bridge\nbetween image and text, we develop a simple yet effective self-labeling\nalgorithm that produces meaningful text labels for the clusters. Through\nextensive experiments, we show that our pipeline works well on standard\ndatasets such as CIFAR-10, CIFAR-100, and ImageNet-1k. It also extends to\ndatasets without predefined labels, such as LAION-Aesthetics and WikiArts. We\nreleased the code in https://github.com/LeslieTrue/CPP.\n","authors":["Tianzhe Chu","Shengbang Tong","Tianjiao Ding","Xili Dai","Benjamin David Haeffele","René Vidal","Yi Ma"],"pdf_url":"https://arxiv.org/pdf/2306.05272v3.pdf","comment":"23 pages, 14 figures"},{"id":"http://arxiv.org/abs/2309.15499v2","updated":"2023-10-03T07:35:41Z","published":"2023-09-27T08:52:08Z","title":"Bayesian Personalized Federated Learning with Shared and Personalized\n Uncertainty Representations","summary":" Bayesian personalized federated learning (BPFL) addresses challenges in\nexisting personalized FL (PFL). BPFL aims to quantify the uncertainty and\nheterogeneity within and across clients towards uncertainty representations by\naddressing the statistical heterogeneity of client data. In PFL, some recent\npreliminary work proposes to decompose hidden neural representations into\nshared and local components and demonstrates interesting results. However, most\nof them do not address client uncertainty and heterogeneity in FL systems,\nwhile appropriately decoupling neural representations is challenging and often\nad hoc. In this paper, we make the first attempt to introduce a general BPFL\nframework to decompose and jointly learn shared and personalized uncertainty\nrepresentations on statistically heterogeneous client data over time. A\nBayesian federated neural network BPFed instantiates BPFL by jointly learning\ncross-client shared uncertainty and client-specific personalized uncertainty\nover statistically heterogeneous and randomly participating clients. 
We further\ninvolve continual updating of prior distribution in BPFed to speed up the\nconvergence and avoid catastrophic forgetting. Theoretical analysis and\nguarantees are provided in addition to the experimental evaluation of BPFed\nagainst the diversified baselines.\n","authors":["Hui Chen","Hengyu Liu","Longbing Cao","Tiancheng Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.15499v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01853v1","updated":"2023-10-03T07:34:27Z","published":"2023-10-03T07:34:27Z","title":"Score-based Data Assimilation for a Two-Layer Quasi-Geostrophic Model","summary":" Data assimilation addresses the problem of identifying plausible state\ntrajectories of dynamical systems given noisy or incomplete observations. In\ngeosciences, it presents challenges due to the high-dimensionality of\ngeophysical dynamical systems, often exceeding millions of dimensions. This\nwork assesses the scalability of score-based data assimilation (SDA), a novel\ndata assimilation method, in the context of such systems. We propose\nmodifications to the score network architecture aimed at significantly reducing\nmemory consumption and execution time. We demonstrate promising results for a\ntwo-layer quasi-geostrophic model.\n","authors":["François Rozet","Gilles Louppe"],"pdf_url":"https://arxiv.org/pdf/2310.01853v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11958v2","updated":"2023-10-03T07:26:32Z","published":"2023-08-23T06:57:05Z","title":"Maintaining Plasticity in Continual Learning via Regenerative\n Regularization","summary":" In continual learning, plasticity refers to the ability of an agent to\nquickly adapt to new information. Neural networks are known to lose plasticity\nwhen processing non-stationary data streams. In this paper, we propose L2 Init,\na simple approach for maintaining plasticity by incorporating in the loss\nfunction L2 regularization toward initial parameters. This is very similar to\nstandard L2 regularization (L2), the only difference being that L2 regularizes\ntoward the origin. L2 Init is simple to implement and requires selecting only a\nsingle hyper-parameter. The motivation for this method is the same as that of\nmethods that reset neurons or parameter values. Intuitively, when recent losses\nare insensitive to particular parameters, these parameters should drift toward\ntheir initial values. This prepares parameters to adapt quickly to new tasks.\nOn problems representative of different types of nonstationarity in continual\nsupervised learning, we demonstrate that L2 Init most consistently mitigates\nplasticity loss compared to previously proposed approaches.\n","authors":["Saurabh Kumar","Henrik Marklund","Benjamin Van Roy"],"pdf_url":"https://arxiv.org/pdf/2308.11958v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01846v1","updated":"2023-10-03T07:23:22Z","published":"2023-10-03T07:23:22Z","title":"Benchmarking and Improving Generator-Validator Consistency of Language\n Models","summary":" As of September 2023, ChatGPT correctly answers \"what is 7+8\" with 15, but\nwhen asked \"7+8=15, True or False\" it responds with \"False\". This inconsistency\nbetween generating and validating an answer is prevalent in language models\n(LMs) and erodes trust. In this paper, we propose a framework for measuring the\nconsistency between generation and validation (which we call\ngenerator-validator consistency, or GV-consistency), finding that even GPT-4, a\nstate-of-the-art LM, is GV-consistent only 76% of the time. 
To improve the\nconsistency of LMs, we propose to finetune on the filtered generator and\nvalidator responses that are GV-consistent, and call this approach consistency\nfine-tuning. We find that this approach improves GV-consistency of Alpaca-30B\nfrom 60% to 93%, and the improvement extrapolates to unseen tasks and domains\n(e.g., GV-consistency for positive style transfers extrapolates to unseen\nstyles like humor). In addition to improving consistency, consistency\nfine-tuning improves both generator quality and validator accuracy without\nusing any labeled data. Evaluated across 6 tasks, including math questions,\nknowledge-intensive QA, and instruction following, our method improves the\ngenerator quality by 16% and the validator accuracy by 6.3% across all tasks.\n","authors":["Xiang Lisa Li","Vaishnavi Shrivastava","Siyan Li","Tatsunori Hashimoto","Percy Liang"],"pdf_url":"https://arxiv.org/pdf/2310.01846v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2310.01845v1","updated":"2023-10-03T07:19:59Z","published":"2023-10-03T07:19:59Z","title":"Zero-Shot Refinement of Buildings' Segmentation Models using SAM","summary":" Foundation models have excelled in various tasks but are often evaluated on\ngeneral benchmarks. The adaptation of these models for specific domains, such\nas remote sensing imagery, remains an underexplored area. In remote sensing,\nprecise building instance segmentation is vital for applications like urban\nplanning. While Convolutional Neural Networks (CNNs) perform well, their\ngeneralization can be limited. For this aim, we present a novel approach to\nadapt foundation models to address existing models' generalization dropback.\nAmong several models, our focus centers on the Segment Anything Model (SAM), a\npotent foundation model renowned for its prowess in class-agnostic image\nsegmentation capabilities. We start by identifying the limitations of SAM,\nrevealing its suboptimal performance when applied to remote sensing imagery.\nMoreover, SAM does not offer recognition abilities and thus fails to classify\nand tag localized objects. To address these limitations, we introduce different\nprompting strategies, including integrating a pre-trained CNN as a prompt\ngenerator. This novel approach augments SAM with recognition abilities, a first\nof its kind. We evaluated our method on three remote sensing datasets,\nincluding the WHU Buildings dataset, the Massachusetts Buildings dataset, and\nthe AICrowd Mapping Challenge. For out-of-distribution performance on the WHU\ndataset, we achieve a 5.47% increase in IoU and a 4.81% improvement in\nF1-score. For in-distribution performance on the WHU dataset, we observe a\n2.72% and 1.58% increase in True-Positive-IoU and True-Positive-F1 score,\nrespectively. We intend to release our code repository, hoping to inspire\nfurther exploration of foundation models for domain-specific tasks within the\nremote sensing community.\n","authors":["Ali Mayladan","Hasan Nasrallah","Hasan Moughnieh","Mustafa Shukor","Ali J. Ghandour"],"pdf_url":"https://arxiv.org/pdf/2310.01845v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00410v2","updated":"2023-10-03T07:05:03Z","published":"2023-09-01T12:07:40Z","title":"Selective Scene Text Removal","summary":" Scene text removal (STR) is the image transformation task to remove text\nregions in scene images. The conventional STR methods remove all scene text.\nThis means that the existing methods cannot select text to be removed. 
In this\npaper, we propose a novel task setting named selective scene text removal\n(SSTR) that removes only target words specified by the user. Although SSTR is a\nmore complex task than STR, the proposed multi-module structure enables\nefficient training for SSTR. Experimental results show that the proposed method\ncan remove target words as expected.\n","authors":["Hayato Mitani","Akisato Kimura","Seiichi Uchida"],"pdf_url":"https://arxiv.org/pdf/2309.00410v2.pdf","comment":"12 pages, 8 figures, Accepted at the 34th British Machine Vision\n Conference, code:https://github.com/mitanihayato/Selective-Scene-Text-Removal"},{"id":"http://arxiv.org/abs/2310.01837v1","updated":"2023-10-03T07:01:23Z","published":"2023-10-03T07:01:23Z","title":"Extending CAM-based XAI methods for Remote Sensing Imagery Segmentation","summary":" Current AI-based methods do not provide comprehensible physical\ninterpretations of the utilized data, extracted features, and\npredictions/inference operations. As a result, deep learning models trained\nusing high-resolution satellite imagery lack transparency and explainability\nand can be merely seen as a black box, which limits their wide-level adoption.\nExperts need help understanding the complex behavior of AI models and the\nunderlying decision-making process. The explainable artificial intelligence\n(XAI) field is an emerging field providing means for robust, practical, and\ntrustworthy deployment of AI models. Several XAI techniques have been proposed\nfor image classification tasks, whereas the interpretation of image\nsegmentation remains largely unexplored. This paper offers to bridge this gap\nby adapting the recent XAI classification algorithms and making them usable for\nmuti-class image segmentation, where we mainly focus on buildings' segmentation\nfrom high-resolution satellite images. To benchmark and compare the performance\nof the proposed approaches, we introduce a new XAI evaluation methodology and\nmetric based on \"Entropy\" to measure the model uncertainty. Conventional XAI\nevaluation methods rely mainly on feeding area-of-interest regions from the\nimage back to the pre-trained (utility) model and then calculating the average\nchange in the probability of the target class. Those evaluation metrics lack\nthe needed robustness, and we show that using Entropy to monitor the model\nuncertainty in segmenting the pixels within the target class is more suitable.\nWe hope this work will pave the way for additional XAI research for image\nsegmentation and applications in the remote sensing discipline.\n","authors":["Abdul Karim Gizzini","Mustafa Shukor","Ali J. Ghandour"],"pdf_url":"https://arxiv.org/pdf/2310.01837v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01835v1","updated":"2023-10-03T06:58:45Z","published":"2023-10-03T06:58:45Z","title":"EMBERSim: A Large-Scale Databank for Boosting Similarity Search in\n Malware Analysis","summary":" In recent years there has been a shift from heuristics-based malware\ndetection towards machine learning, which proves to be more robust in the\ncurrent heavily adversarial threat landscape. While we acknowledge machine\nlearning to be better equipped to mine for patterns in the increasingly high\namounts of similar-looking files, we also note a remarkable scarcity of the\ndata available for similarity-targeted research. Moreover, we observe that the\nfocus in the few related works falls on quantifying similarity in malware,\noften overlooking the clean data. 
This one-sided quantification is especially\ndangerous in the context of detection bypass. We propose to address the\ndeficiencies in the space of similarity research on binary files, starting from\nEMBER - one of the largest malware classification data sets. We enhance EMBER\nwith similarity information as well as malware class tags, to enable further\nresearch in the similarity space. Our contribution is threefold: (1) we publish\nEMBERSim, an augmented version of EMBER, that includes similarity-informed\ntags; (2) we enrich EMBERSim with automatically determined malware class tags\nusing the open-source tool AVClass on VirusTotal data and (3) we describe and\nshare the implementation for our class scoring technique and leaf similarity\nmethod.\n","authors":["Dragos Georgian Corlatescu","Alexandru Dinu","Mihaela Gaman","Paul Sumedrea"],"pdf_url":"https://arxiv.org/pdf/2310.01835v1.pdf","comment":"Accepted at the 37th Conference on Neural Information Processing\n Systems (NeurIPS 2023) Track on Datasets and Benchmarks"},{"id":"http://arxiv.org/abs/2310.01828v1","updated":"2023-10-03T06:51:48Z","published":"2023-10-03T06:51:48Z","title":"Trainable Noise Model as an XAI evaluation method: application on Sobol\n for remote sensing image segmentation","summary":" eXplainable Artificial Intelligence (XAI) has emerged as an essential\nrequirement when dealing with mission-critical applications, ensuring\ntransparency and interpretability of the employed black box AI models. The\nsignificance of XAI spans various domains, from healthcare to finance, where\nunderstanding the decision-making process of deep learning algorithms is\nessential. Most AI-based computer vision models are often black boxes; hence,\nproviding explainability of deep neural networks in image processing is crucial\nfor their wide adoption and deployment in medical image analysis, autonomous\ndriving, and remote sensing applications. Recently, several XAI methods for\nimage classification tasks have been introduced. On the contrary, image\nsegmentation has received comparatively less attention in the context of\nexplainability, although it is a fundamental task in computer vision\napplications, especially in remote sensing. Only some research proposes\ngradient-based XAI algorithms for image segmentation. This paper adapts the\nrecent gradient-free Sobol XAI method for semantic segmentation. To measure the\nperformance of the Sobol method for segmentation, we propose a quantitative XAI\nevaluation method based on a learnable noise model. The main objective of this\nmodel is to induce noise on the explanation maps, where higher induced noise\nsignifies low accuracy and vice versa. A benchmark analysis is conducted to\nevaluate and compare performance of three XAI methods, including Seg-Grad-CAM,\nSeg-Grad-CAM++ and Seg-Sobol using the proposed noise-based evaluation\ntechnique. This constitutes the first attempt to run and evaluate XAI methods\nusing high-resolution satellite images.\n","authors":["Hossein Shreim","Abdul Karim Gizzini","Ali J. 
Ghandour"],"pdf_url":"https://arxiv.org/pdf/2310.01828v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01825v1","updated":"2023-10-03T06:42:28Z","published":"2023-10-03T06:42:28Z","title":"Empirical Study of PEFT techniques for Winter Wheat Segmentation","summary":" Parameter Efficient Fine Tuning (PEFT) techniques have recently experienced\nsignificant growth and have been extensively employed to adapt large vision and\nlanguage models to various domains, enabling satisfactory model performance\nwith minimal computational needs. Despite these advances, more research has yet\nto delve into potential PEFT applications in real-life scenarios, particularly\nin the critical domains of remote sensing and crop monitoring. The diversity of\nclimates across different regions and the need for comprehensive large-scale\ndatasets have posed significant obstacles to accurately identify crop types\nacross varying geographic locations and changing growing seasons. This study\nseeks to bridge this gap by comprehensively exploring the feasibility of\ncross-area and cross-year out-of-distribution generalization using the\nState-of-the-Art (SOTA) wheat crop monitoring model. The aim of this work is to\nexplore PEFT approaches for crop monitoring. Specifically, we focus on adapting\nthe SOTA TSViT model to address winter wheat field segmentation, a critical\ntask for crop monitoring and food security. This adaptation process involves\nintegrating different PEFT techniques, including BigFit, LoRA, Adaptformer, and\nprompt tuning. Using PEFT techniques, we achieved notable results comparable to\nthose achieved using full fine-tuning methods while training only a mere 0.7%\nparameters of the whole TSViT architecture. The in-house labeled data-set,\nreferred to as the Beqaa-Lebanon dataset, comprises high-quality annotated\npolygons for wheat and non-wheat classes with a total surface of 170 kmsq, over\nfive consecutive years. Using Sentinel-2 images, our model achieved a 84%\nF1-score. We intend to publicly release the Lebanese winter wheat data set,\ncode repository, and model weights.\n","authors":["Mohamad Hasan Zahweh","Hasan Nasrallah","Mustafa Shukor","Ghaleb Faour","Ali J. Ghandour"],"pdf_url":"https://arxiv.org/pdf/2310.01825v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01824v1","updated":"2023-10-03T06:41:18Z","published":"2023-10-03T06:41:18Z","title":"Mini-BEHAVIOR: A Procedurally Generated Benchmark for Long-horizon\n Decision-Making in Embodied AI","summary":" We present Mini-BEHAVIOR, a novel benchmark for embodied AI that challenges\nagents to use reasoning and decision-making skills to solve complex activities\nthat resemble everyday human challenges. The Mini-BEHAVIOR environment is a\nfast, realistic Gridworld environment that offers the benefits of rapid\nprototyping and ease of use while preserving a symbolic level of physical\nrealism and complexity found in complex embodied AI benchmarks. We introduce\nkey features such as procedural generation, to enable the creation of countless\ntask variations and support open-ended learning. Mini-BEHAVIOR provides\nimplementations of various household tasks from the original BEHAVIOR\nbenchmark, along with starter code for data collection and reinforcement\nlearning agent training. 
In essence, Mini-BEHAVIOR offers a fast, open-ended\nbenchmark for evaluating decision-making and planning solutions in embodied AI.\nIt serves as a user-friendly entry point for research and facilitates the\nevaluation and development of solutions, simplifying their assessment and\ndevelopment while advancing the field of embodied AI. Code is publicly\navailable at https://github.com/StanfordVL/mini_behavior.\n","authors":["Emily Jin","Jiaheng Hu","Zhuoyi Huang","Ruohan Zhang","Jiajun Wu","Li Fei-Fei","Roberto Martín-Martín"],"pdf_url":"https://arxiv.org/pdf/2310.01824v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01821v1","updated":"2023-10-03T06:33:05Z","published":"2023-10-03T06:33:05Z","title":"MIMO-NeRF: Fast Neural Rendering with Multi-input Multi-output Neural\n Radiance Fields","summary":" Neural radiance fields (NeRFs) have shown impressive results for novel view\nsynthesis. However, they depend on the repetitive use of a single-input\nsingle-output multilayer perceptron (SISO MLP) that maps 3D coordinates and\nview direction to the color and volume density in a sample-wise manner, which\nslows the rendering. We propose a multi-input multi-output NeRF (MIMO-NeRF)\nthat reduces the number of MLPs running by replacing the SISO MLP with a MIMO\nMLP and conducting mappings in a group-wise manner. One notable challenge with\nthis approach is that the color and volume density of each point can differ\naccording to a choice of input coordinates in a group, which can lead to some\nnotable ambiguity. We also propose a self-supervised learning method that\nregularizes the MIMO MLP with multiple fast reformulated MLPs to alleviate this\nambiguity without using pretrained models. The results of a comprehensive\nexperimental evaluation including comparative and ablation studies are\npresented to show that MIMO-NeRF obtains a good trade-off between speed and\nquality with a reasonable training time. We then demonstrate that MIMO-NeRF is\ncompatible with and complementary to previous advancements in NeRFs by applying\nit to two representative fast NeRFs, i.e., a NeRF with sample reduction\n(DONeRF) and a NeRF with alternative representations (TensoRF).\n","authors":["Takuhiro Kaneko"],"pdf_url":"https://arxiv.org/pdf/2310.01821v1.pdf","comment":"Accepted to ICCV 2023. Project page:\n https://www.kecl.ntt.co.jp/people/kaneko.takuhiro/projects/mimo-nerf/"},{"id":"http://arxiv.org/abs/2310.01820v1","updated":"2023-10-03T06:25:14Z","published":"2023-10-03T06:25:14Z","title":"Towards Robust Fidelity for Evaluating Explainability of Graph Neural\n Networks","summary":" Graph Neural Networks (GNNs) are neural models that leverage the dependency\nstructure in graphical data via message passing among the graph nodes. GNNs\nhave emerged as pivotal architectures in analyzing graph-structured data, and\ntheir expansive application in sensitive domains requires a comprehensive\nunderstanding of their decision-making processes -- necessitating a framework\nfor GNN explainability. An explanation function for GNNs takes a pre-trained\nGNN along with a graph as input, to produce a `sufficient statistic' subgraph\nwith respect to the graph label. A main challenge in studying GNN\nexplainability is to provide fidelity measures that evaluate the performance of\nthese explanation functions. This paper studies this foundational challenge,\nspotlighting the inherent limitations of prevailing fidelity metrics, including\n$Fid_+$, $Fid_-$, and $Fid_\\Delta$. 
Specifically, a formal,\ninformation-theoretic definition of explainability is introduced and it is\nshown that existing metrics often fail to align with this definition across\nvarious statistical scenarios. The reason is due to potential distribution\nshifts when subgraphs are removed in computing these fidelity measures.\nSubsequently, a robust class of fidelity measures are introduced, and it is\nshown analytically that they are resilient to distribution shift issues and are\napplicable in a wide range of scenarios. Extensive empirical analysis on both\nsynthetic and real datasets are provided to illustrate that the proposed\nmetrics are more coherent with gold standard metrics.\n","authors":["Xu Zheng","Farhad Shirani","Tianchun Wang","Wei Cheng","Zhuomin Chen","Haifeng Chen","Hua Wei","Dongsheng Luo"],"pdf_url":"https://arxiv.org/pdf/2310.01820v1.pdf","comment":"23 Pages, 10 figures, under review"},{"id":"http://arxiv.org/abs/2310.01818v1","updated":"2023-10-03T06:16:03Z","published":"2023-10-03T06:16:03Z","title":"AutoLoRa: A Parameter-Free Automated Robust Fine-Tuning Framework","summary":" Robust Fine-Tuning (RFT) is a low-cost strategy to obtain adversarial\nrobustness in downstream applications, without requiring a lot of computational\nresources and collecting significant amounts of data. This paper uncovers an\nissue with the existing RFT, where optimizing both adversarial and natural\nobjectives through the feature extractor (FE) yields significantly divergent\ngradient directions. This divergence introduces instability in the optimization\nprocess, thereby hindering the attainment of adversarial robustness and\nrendering RFT highly sensitive to hyperparameters. To mitigate this issue, we\npropose a low-rank (LoRa) branch that disentangles RFT into two distinct\ncomponents: optimizing natural objectives via the LoRa branch and adversarial\nobjectives via the FE. Besides, we introduce heuristic strategies for\nautomating the scheduling of the learning rate and the scalars of loss terms.\nExtensive empirical evaluations demonstrate that our proposed automated RFT\ndisentangled via the LoRa branch (AutoLoRa) achieves new state-of-the-art\nresults across a range of downstream tasks. AutoLoRa holds significant\npractical utility, as it automatically converts a pre-trained FE into an\nadversarially robust model for downstream tasks without the need for searching\nhyperparameters.\n","authors":["Xilie Xu","Jingfeng Zhang","Mohan Kankanhalli"],"pdf_url":"https://arxiv.org/pdf/2310.01818v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18030v2","updated":"2023-10-03T06:11:48Z","published":"2023-05-25T19:41:40Z","title":"Automated Search-Space Generation Neural Architecture Search","summary":" To search an optimal sub-network within a general deep neural network (DNN),\nexisting neural architecture search (NAS) methods typically rely on\nhandcrafting a search space beforehand. Such requirements make it challenging\nto extend them onto general scenarios without significant human expertise and\nmanual intervention. 
To overcome the limitations, we propose Automated\nSearch-Space Generation Neural Architecture Search (ASGNAS), perhaps the first\nautomated system to train general DNNs that cover all candidate connections and\noperations and produce high-performing sub-networks in the one shot manner.\nTechnologically, ASGNAS delivers three noticeable contributions to minimize\nhuman efforts: (i) automated search space generation for general DNNs; (ii) a\nHierarchical Half-Space Projected Gradient (H2SPG) that leverages the hierarchy\nand dependency within generated search space to ensure the network validity\nduring optimization, and reliably produces a solution with both high\nperformance and hierarchical group sparsity; and (iii) automated sub-network\nconstruction upon the H2SPG solution. Numerically, we demonstrate the\neffectiveness of ASGNAS on a variety of general DNNs, including RegNet,\nStackedUnets, SuperResNet, and DARTS, over benchmark datasets such as CIFAR10,\nFashion-MNIST, ImageNet, STL-10 , and SVNH. The sub-networks computed by ASGNAS\nachieve competitive even superior performance compared to the starting full\nDNNs and other state-of-the-arts. The library will be released at\nhttps://github.com/tianyic/only_train_once.\n","authors":["Tianyi Chen","Luming Liang","Tianyu Ding","Ilya Zharkov"],"pdf_url":"https://arxiv.org/pdf/2305.18030v2.pdf","comment":"Graph visualization for DARTS, SuperResNet are omitted for arXiv\n version due to exceeding page dimension limit. Please refer to the\n open-review version for taking the visualizations"},{"id":"http://arxiv.org/abs/2310.01815v1","updated":"2023-10-03T06:09:59Z","published":"2023-10-03T06:09:59Z","title":"What Determines the Price of NFTs?","summary":" In the evolving landscape of digital art, Non-Fungible Tokens (NFTs) have\nemerged as a groundbreaking platform, bridging the realms of art and\ntechnology. NFTs serve as the foundational framework that has revolutionized\nthe market for digital art, enabling artists to showcase and monetize their\ncreations in unprecedented ways. NFTs combine metadata stored on the blockchain\nwith off-chain data, such as images, to create a novel form of digital\nownership. It is not fully understood how these factors come together to\ndetermine NFT prices. In this study, we analyze both on-chain and off-chain\ndata of NFT collections trading on OpenSea to understand what influences NFT\npricing. Our results show that while text and image data of the NFTs can be\nused to explain price variations within collections, the extracted features do\nnot generalize to new, unseen collections. Furthermore, we find that an NFT\ncollection's trading volume often relates to its online presence, like social\nmedia followers and website traffic.\n","authors":["Vivian Ziemke","Benjamin Estermann","Roger Wattenhofer","Ye Wang"],"pdf_url":"https://arxiv.org/pdf/2310.01815v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07505v2","updated":"2023-10-03T06:09:18Z","published":"2023-08-15T00:08:43Z","title":"Data Race Detection Using Large Language Models","summary":" Large language models (LLMs) are demonstrating significant promise as an\nalternate strategy to facilitate analyses and optimizations of high-performance\ncomputing programs, circumventing the need for resource-intensive manual tool\ncreation. In this paper, we explore a novel LLM-based data race detection\napproach combining prompting engineering and fine-tuning techniques. 
We create\na dedicated dataset named DRB-ML, which is derived from DataRaceBench, with\nfine-grain labels showing the presence of data race pairs and their associated\nvariables, line numbers, and read/write information. DRB-ML is then used to\nevaluate representative LLMs and fine-tune open-source ones. Our experiment\nshows that LLMs can be a viable approach to data race detection. However, they\nstill cannot compete with traditional data race detection tools when we need\ndetailed information about variable pairs causing data races.\n","authors":["Le Chen","Xianzhong Ding","Murali Emani","Tristan Vanderbruggen","Pei-hung Lin","Chuanhua Liao"],"pdf_url":"https://arxiv.org/pdf/2308.07505v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01808v1","updated":"2023-10-03T05:42:53Z","published":"2023-10-03T05:42:53Z","title":"Simulation-based Inference with the Generalized Kullback-Leibler\n Divergence","summary":" In Simulation-based Inference, the goal is to solve the inverse problem when\nthe likelihood is only known implicitly. Neural Posterior Estimation commonly\nfits a normalized density estimator as a surrogate model for the posterior.\nThis formulation cannot easily fit unnormalized surrogates because it optimizes\nthe Kullback-Leibler divergence. We propose to optimize a generalized\nKullback-Leibler divergence that accounts for the normalization constant in\nunnormalized distributions. The objective recovers Neural Posterior Estimation\nwhen the model class is normalized and unifies it with Neural Ratio Estimation,\ncombining both into a single objective. We investigate a hybrid model that\noffers the best of both worlds by learning a normalized base distribution and a\nlearned ratio. We also present benchmark results.\n","authors":["Benjamin Kurt Miller","Marco Federici","Christoph Weniger","Patrick Forré"],"pdf_url":"https://arxiv.org/pdf/2310.01808v1.pdf","comment":"Accepted at Synergy of Scientific and Machine Learning Modeling ICML\n 2023 Workshop https://syns-ml.github.io/2023/contributions/"},{"id":"http://arxiv.org/abs/2310.01807v1","updated":"2023-10-03T05:40:56Z","published":"2023-10-03T05:40:56Z","title":"Discrete, compositional, and symbolic representations through attractor\n dynamics","summary":" Compositionality is an important feature of discrete symbolic systems, such\nas language and programs, as it enables them to have infinite capacity despite\na finite symbol set. It serves as a useful abstraction for reasoning in both\ncognitive science and in AI, yet the interface between continuous and symbolic\nprocessing is often imposed by fiat at the algorithmic level, such as by means\nof quantization or a softmax sampling step. In this work, we explore how\ndiscretization could be implemented in a more neurally plausible manner through\nthe modeling of attractor dynamics that partition the continuous representation\nspace into basins that correspond to sequences of symbols. Building on\nestablished work in attractor networks and introducing novel training methods,\nwe show that imposing structure in the symbolic space can produce\ncompositionality in the attractor-supported representation space of rich\nsensory inputs. 
Lastly, we argue that our model exhibits the process of an\ninformation bottleneck that is thought to play a role in conscious experience,\ndecomposing the rich information of a sensory input into stable components\nencoding symbolic information.\n","authors":["Andrew Nam","Eric Elmoznino","Nikolay Malkin","Chen Sun","Yoshua Bengio","Guillaume Lajoie"],"pdf_url":"https://arxiv.org/pdf/2310.01807v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13649v2","updated":"2023-10-03T05:30:36Z","published":"2023-06-23T17:56:26Z","title":"Generalized Knowledge Distillation for Auto-regressive Language Models","summary":" Knowledge distillation (KD) is widely used for compressing a teacher model to\nreduce its inference cost and memory footprint, by training a smaller student\nmodel. However, current KD methods for auto-regressive sequence models suffer\nfrom distribution mismatch between output sequences seen during training and\nthose generated by the student during inference. To address this issue, we\nintroduce Generalized Knowledge Distillation (GKD). Instead of solely relying\non a fixed set of output sequences, GKD trains the student on its\nself-generated output sequences by leveraging feedback from the teacher on such\nsequences. Unlike supervised KD approaches, GKD also offers the flexibility to\nemploy alternative loss functions between the student and teacher, which can be\nuseful when the student lacks the expressivity to mimic the teacher's\ndistribution. Furthermore, GKD facilitates the seamless integration of\ndistillation with RL fine-tuning (RLHF). We demonstrate the efficacy of GKD for\ndistilling auto-regressive T5 language models on summarization, translation,\nand arithmetic reasoning tasks as well as task-agnostic instruction tuning.\n","authors":["Rishabh Agarwal","Nino Vieillard","Yongchao Zhou","Piotr Stanczyk","Sabela Ramos","Matthieu Geist","Olivier Bachem"],"pdf_url":"https://arxiv.org/pdf/2306.13649v2.pdf","comment":"First two authors contributed equally. Added new results and\n experiment details"},{"id":"http://arxiv.org/abs/2310.00339v2","updated":"2023-10-03T05:22:37Z","published":"2023-09-30T10:51:27Z","title":"FedLPA: Personalized One-shot Federated Learning with Layer-Wise\n Posterior Aggregation","summary":" Efficiently aggregating trained neural networks from local clients into a\nglobal model on a server is a widely researched topic in federated learning.\nRecently, motivated by diminishing privacy concerns, mitigating potential\nattacks, and reducing the overhead of communication, one-shot federated\nlearning (i.e., limiting client-server communication into a single round) has\ngained popularity among researchers. However, the one-shot aggregation\nperformances are sensitively affected by the non-identical training data\ndistribution, which exhibits high statistical heterogeneity in some real-world\nscenarios. To address this issue, we propose a novel one-shot aggregation\nmethod with Layer-wise Posterior Aggregation, named FedLPA. FedLPA aggregates\nlocal models to obtain a more accurate global model without requiring extra\nauxiliary datasets or exposing any confidential local information, e.g., label\ndistributions. To effectively capture the statistics maintained in the biased\nlocal datasets in the practical non-IID scenario, we efficiently infer the\nposteriors of each layer in each local model using layer-wise Laplace\napproximation and aggregate them to train the global parameters. 
Extensive\nexperimental results demonstrate that FedLPA significantly improves learning\nperformance over state-of-the-art methods across several metrics.\n","authors":["Xiang Liu","Liangxi Liu","Feiyang Ye","Yunheng Shen","Xia Li","Linshan Jiang","Jialin Li"],"pdf_url":"https://arxiv.org/pdf/2310.00339v2.pdf","comment":"26pages, 6 figures"},{"id":"http://arxiv.org/abs/2302.02676v7","updated":"2023-10-03T05:04:47Z","published":"2023-02-06T10:28:16Z","title":"Chain of Hindsight Aligns Language Models with Feedback","summary":" Learning from human preferences is important for language models to match\nhuman needs and to align with human and social values. Prior works have\nachieved remarkable successes by learning from human feedback to understand and\nfollow instructions. Nonetheless, these methods are either founded on\nhand-picked model generations that are favored by human annotators, rendering\nthem inefficient in terms of data utilization and challenging to apply in\ngeneral, or they depend on reinforcement learning, which often suffers from\nimperfect reward functions and relies on extremely challenging optimizations.\nIn this work, we propose a novel technique, Chain of Hindsight, that is easy to\noptimize and can learn from any form of feedback, regardless of its polarity.\nOur idea is inspired by how humans learn from extensive feedback presented in\nthe form of languages. We convert all types of feedback into sequences of\nsentences, which are then used to fine-tune the model, allowing us to take\nadvantage of the language comprehension capabilities of language models. We\ncondition the model on a sequence of model generations paired with feedback. By\ndoing so, the model is trained to generate outputs based on feedback, while\nlearning to identify and correct negative attributes or errors. Applying our\nmethod to large language models, we observed that Chain of Hindsight\nsignificantly surpasses previous methods in aligning language models with human\npreferences. We report significant improvements on summarization and dialogue\nbenchmarks, with our approach markedly preferred in human evaluations.\n","authors":["Hao Liu","Carmelo Sferrazza","Pieter Abbeel"],"pdf_url":"https://arxiv.org/pdf/2302.02676v7.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02130v2","updated":"2023-10-03T04:47:17Z","published":"2023-09-05T11:16:47Z","title":"Asymmetric Momentum: A Rethinking of Gradient Descent","summary":" Through theoretical and experimental validation, unlike all existing adaptive\nmethods like Adam which penalize frequently-changing parameters and are only\napplicable to sparse gradients, we propose the simplest SGD enhanced method,\nLoss-Controlled Asymmetric Momentum(LCAM). By averaging the loss, we divide\ntraining process into different loss phases and using different momentum. It\nnot only can accelerates slow-changing parameters for sparse gradients, similar\nto adaptive optimizers, but also can choose to accelerates frequently-changing\nparameters for non-sparse gradients, thus being adaptable to all types of\ndatasets. We reinterpret the machine learning training process through the\nconcepts of weight coupling and weight traction, and experimentally validate\nthat weights have directional specificity, which are correlated with the\nspecificity of the dataset. Thus interestingly, we observe that in non-sparse\ngradients, frequently-changing parameters should actually be accelerated, which\nis completely opposite to traditional adaptive perspectives. 
Compared to\ntraditional SGD with momentum, this algorithm separates the weights without\nadditional computational costs. It is noteworthy that this method relies on the\nnetwork's ability to extract complex features. We primarily use Wide Residual\nNetworks for our research, employing the classic datasets Cifar10 and Cifar100\nto test the ability for feature separation and conclude phenomena that are much\nmore important than just accuracy rates. Finally, compared to classic SGD\ntuning methods, while using WRN on these two datasets and with nearly half the\ntraining epochs, we achieve equal or better test accuracy.\n","authors":["Gongyue Zhang","Dinghuang Zhang","Shuwen Zhao","Donghan Liu","Carrie M. Toptan","Honghai Liu"],"pdf_url":"https://arxiv.org/pdf/2309.02130v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01794v1","updated":"2023-10-03T04:42:44Z","published":"2023-10-03T04:42:44Z","title":"GNNX-BENCH: Unravelling the Utility of Perturbation-based GNN Explainers\n through In-depth Benchmarking","summary":" Numerous explainability methods have been proposed to shed light on the inner\nworkings of GNNs. Despite the inclusion of empirical evaluations in all the\nproposed algorithms, the interrogative aspects of these evaluations lack\ndiversity. As a result, various facets of explainability pertaining to GNNs,\nsuch as a comparative analysis of counterfactual reasoners, their stability to\nvariational factors such as different GNN architectures, noise, stochasticity\nin non-convex loss surfaces, feasibility amidst domain constraints, and so\nforth, have yet to be formally investigated. Motivated by this need, we present\na benchmarking study on perturbation-based explainability methods for GNNs,\naiming to systematically evaluate and compare a wide range of explainability\ntechniques. Among the key findings of our study, we identify the Pareto-optimal\nmethods that exhibit superior efficacy and stability in the presence of noise.\nNonetheless, our study reveals that all algorithms are affected by stability\nissues when faced with noisy data. Furthermore, we have established that the\ncurrent generation of counterfactual explainers often fails to provide feasible\nrecourses due to violations of topological constraints encoded by\ndomain-specific considerations. Overall, this benchmarking study empowers\nstakeholders in the field of GNNs with a comprehensive understanding of the\nstate-of-the-art explainability methods, potential research problems for\nfurther enhancement, and the implications of their application in real-world\nscenarios.\n","authors":["Mert Kosan","Samidha Verma","Burouj Armgaan","Khushbu Pahwa","Ambuj Singh","Sourav Medya","Sayan Ranu"],"pdf_url":"https://arxiv.org/pdf/2310.01794v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.05825v2","updated":"2023-10-03T04:36:34Z","published":"2023-02-12T00:39:25Z","title":"Koopman-based generalization bound: New aspect for full-rank weights","summary":" We propose a new bound for generalization of neural networks using Koopman\noperators. Whereas most of existing works focus on low-rank weight matrices, we\nfocus on full-rank weight matrices. Our bound is tighter than existing\nnorm-based bounds when the condition numbers of weight matrices are small.\nEspecially, it is completely independent of the width of the network if the\nweight matrices are orthogonal. Our bound does not contradict to the existing\nbounds but is a complement to the existing bounds. 
As supported by several\nexisting empirical results, low-rankness is not the only reason for\ngeneralization. Furthermore, our bound can be combined with the existing bounds\nto obtain a tighter bound. Our result sheds new light on understanding\ngeneralization of neural networks with full-rank weight matrices, and it\nprovides a connection between operator-theoretic analysis and generalization of\nneural networks.\n","authors":["Yuka Hashimoto","Sho Sonoda","Isao Ishikawa","Atsushi Nitanda","Taiji Suzuki"],"pdf_url":"https://arxiv.org/pdf/2302.05825v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.09241v5","updated":"2023-10-03T04:25:41Z","published":"2023-05-16T07:40:05Z","title":"Unlearnable Examples Give a False Sense of Security: Piercing through\n Unexploitable Data with Learnable Examples","summary":" Safeguarding data from unauthorized exploitation is vital for privacy and\nsecurity, especially in recent rampant research in security breach such as\nadversarial/membership attacks. To this end, \\textit{unlearnable examples}\n(UEs) have been recently proposed as a compelling protection, by adding\nimperceptible perturbation to data so that models trained on them cannot\nclassify them accurately on original clean distribution. Unfortunately, we find\nUEs provide a false sense of security, because they cannot stop unauthorized\nusers from utilizing other unprotected data to remove the protection, by\nturning unlearnable data into learnable again. Motivated by this observation,\nwe formally define a new threat by introducing \\textit{learnable unauthorized\nexamples} (LEs) which are UEs with their protection removed. The core of this\napproach is a novel purification process that projects UEs onto the manifold of\nLEs. This is realized by a new joint-conditional diffusion model which denoises\nUEs conditioned on the pixel and perceptual similarity between UEs and LEs.\nExtensive experiments demonstrate that LE delivers state-of-the-art countering\nperformance against both supervised UEs and unsupervised UEs in various\nscenarios, which is the first generalizable countermeasure to UEs across\nsupervised learning and unsupervised learning. Our code is available at\n\\url{https://github.com/jiangw-0/LE_JCDP}.\n","authors":["Wan Jiang","Yunfeng Diao","He Wang","Jianxin Sun","Meng Wang","Richang Hong"],"pdf_url":"https://arxiv.org/pdf/2305.09241v5.pdf","comment":"Accepted in MM 2023"},{"id":"http://arxiv.org/abs/2310.00535v2","updated":"2023-10-03T04:23:26Z","published":"2023-10-01T01:21:35Z","title":"JoMA: Demystifying Multilayer Transformers via JOint Dynamics of MLP and\n Attention","summary":" We propose Joint MLP/Attention (JoMA) dynamics, a novel mathematical\nframework to understand the training procedure of multilayer Transformer\narchitectures. This is achieved by integrating out the self-attention layer in\nTransformers, producing a modified dynamics of MLP layers only. JoMA removes\nunrealistic assumptions in previous analysis (e.g., lack of residual\nconnection) and predicts that the attention first becomes sparse (to learn\nsalient tokens), then dense (to learn less salient tokens) in the presence of\nnonlinear activations, while in the linear case, it is consistent with existing\nworks that show attention becomes sparse over time. We leverage JoMA to\nqualitatively explains how tokens are combined to form hierarchies in\nmultilayer Transformers, when the input tokens are generated by a latent\nhierarchical generative model. 
Experiments on models trained from real-world\ndataset (Wikitext2/Wikitext103) and various pre-trained models (OPT, Pythia)\nverify our theoretical findings.\n","authors":["Yuandong Tian","Yiping Wang","Zhenyu Zhang","Beidi Chen","Simon Du"],"pdf_url":"https://arxiv.org/pdf/2310.00535v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01783v1","updated":"2023-10-03T04:14:17Z","published":"2023-10-03T04:14:17Z","title":"Can large language models provide useful feedback on research papers? A\n large-scale empirical analysis","summary":" Expert feedback lays the foundation of rigorous research. However, the rapid\ngrowth of scholarly production and intricate knowledge specialization challenge\nthe conventional scientific feedback mechanisms. High-quality peer reviews are\nincreasingly difficult to obtain. Researchers who are more junior or from\nunder-resourced settings have especially hard times getting timely feedback.\nWith the breakthrough of large language models (LLM) such as GPT-4, there is\ngrowing interest in using LLMs to generate scientific feedback on research\nmanuscripts. However, the utility of LLM-generated feedback has not been\nsystematically studied. To address this gap, we created an automated pipeline\nusing GPT-4 to provide comments on the full PDFs of scientific papers. We\nevaluated the quality of GPT-4's feedback through two large-scale studies. We\nfirst quantitatively compared GPT-4's generated feedback with human peer\nreviewer feedback in 15 Nature family journals (3,096 papers in total) and the\nICLR machine learning conference (1,709 papers). The overlap in the points\nraised by GPT-4 and by human reviewers (average overlap 30.85% for Nature\njournals, 39.23% for ICLR) is comparable to the overlap between two human\nreviewers (average overlap 28.58% for Nature journals, 35.25% for ICLR). The\noverlap between GPT-4 and human reviewers is larger for the weaker papers. We\nthen conducted a prospective user study with 308 researchers from 110 US\ninstitutions in the field of AI and computational biology to understand how\nresearchers perceive feedback generated by our GPT-4 system on their own\npapers. Overall, more than half (57.4%) of the users found GPT-4 generated\nfeedback helpful/very helpful and 82.4% found it more beneficial than feedback\nfrom at least some human reviewers. While our findings show that LLM-generated\nfeedback can help researchers, we also identify several limitations.\n","authors":["Weixin Liang","Yuhui Zhang","Hancheng Cao","Binglu Wang","Daisy Ding","Xinyu Yang","Kailas Vodrahalli","Siyu He","Daniel Smith","Yian Yin","Daniel McFarland","James Zou"],"pdf_url":"https://arxiv.org/pdf/2310.01783v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00429v2","updated":"2023-10-03T03:59:41Z","published":"2023-09-30T16:41:04Z","title":"On the Stability of Iterative Retraining of Generative Models on their\n own Data","summary":" Deep generative models have made tremendous progress in modeling complex\ndata, often exhibiting generation quality that surpasses a typical human's\nability to discern the authenticity of samples. Undeniably, a key driver of\nthis success is enabled by the massive amounts of web-scale data consumed by\nthese models. Due to these models' striking performance and ease of\navailability, the web will inevitably be increasingly populated with synthetic\ncontent. 
Such a fact directly implies that future iterations of generative\nmodels must contend with the reality that their training is curated from both\nclean data and artificially generated data from past models. In this paper, we\ndevelop a framework to rigorously study the impact of training generative\nmodels on mixed datasets (of real and synthetic data) on their stability. We\nfirst prove the stability of iterative training under the condition that the\ninitial generative models approximate the data distribution well enough and the\nproportion of clean training data (w.r.t. synthetic data) is large enough. We\nempirically validate our theory on both synthetic and natural images by\niteratively training normalizing flows and state-of-the-art diffusion models on\nCIFAR10 and FFHQ.\n","authors":["Quentin Bertrand","Avishek Joey Bose","Alexandre Duplessis","Marco Jiralerspong","Gauthier Gidel"],"pdf_url":"https://arxiv.org/pdf/2310.00429v2.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2309.09075v3","updated":"2023-10-03T13:06:47Z","published":"2023-09-16T18:48:40Z","title":"Music Generation based on Generative Adversarial Networks with\n Transformer","summary":" Autoregressive models based on Transformers have become the prevailing\napproach for generating music compositions that exhibit comprehensive musical\nstructure. These models are typically trained by minimizing the negative\nlog-likelihood (NLL) of the observed sequence in an autoregressive manner.\nHowever, when generating long sequences, the quality of samples from these\nmodels tends to significantly deteriorate due to exposure bias. To address this\nissue, we leverage classifiers trained to differentiate between real and\nsampled sequences to identify these failures. This observation motivates our\nexploration of adversarial losses as a complement to the NLL objective. We\nemploy a pre-trained Span-BERT model as the discriminator in the Generative\nAdversarial Network (GAN) framework, which enhances training stability in our\nexperiments. To optimize discrete sequences within the GAN framework, we\nutilize the Gumbel-Softmax trick to obtain a differentiable approximation of\nthe sampling process. Additionally, we partition the sequences into smaller\nchunks to ensure that memory constraints are met. Through human evaluations and\nthe introduction of a novel discriminative metric, we demonstrate that our\napproach outperforms a baseline model trained solely on likelihood\nmaximization.\n","authors":["Ziyi Jiang","Ruoxue Wu","Zhenghan Chen","Xiaoxuan Liang"],"pdf_url":"https://arxiv.org/pdf/2309.09075v3.pdf","comment":"The results exist serious factual error"},{"id":"http://arxiv.org/abs/2310.01978v1","updated":"2023-10-03T11:42:29Z","published":"2023-10-03T11:42:29Z","title":"Online Multimedia Verification with Computational Tools and OSINT:\n Russia-Ukraine Conflict Case Studies","summary":" This paper investigates the use of computational tools and Open-Source\nIntelligence (OSINT) techniques for verifying online multimedia content, with a\nspecific focus on real-world cases from the Russia-Ukraine conflict. Over a\nnine-month period from April to December 2022, we examine verification\nworkflows, tools, and case studies published by \\faktiskbar. 
Our study\nshowcases the effectiveness of diverse resources, including AI tools,\ngeolocation tools, internet archives, and social media monitoring platforms, in\nenabling journalists and fact-checkers to efficiently process and corroborate\nevidence, ensuring the dissemination of accurate information. This research\nunderscores the vital role of computational tools and OSINT techniques in\npromoting evidence-based reporting and combatting misinformation. We also touch\non the current limitations of available tools and prospects for future\ndevelopments in multimedia verification.\n","authors":["Sohail Ahmed Khan","Jan Gunnar Furuly","Henrik Brattli Vold","Rano Tahseen","Duc-Tien Dang-Nguyen"],"pdf_url":"https://arxiv.org/pdf/2310.01978v1.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2309.11500v3","updated":"2023-10-03T11:37:40Z","published":"2023-09-20T17:59:32Z","title":"A Large-scale Dataset for Audio-Language Representation Learning","summary":" The AI community has made significant strides in developing powerful\nfoundation models, driven by large-scale multimodal datasets. However, in the\naudio representation learning community, the present audio-language datasets\nsuffer from limitations such as insufficient volume, simplistic content, and\narduous collection procedures. To tackle these challenges, we present an\ninnovative and automatic audio caption generation pipeline based on a series of\npublic tools or APIs, and construct a large-scale, high-quality, audio-language\ndataset, named as Auto-ACD, comprising over 1.9M audio-text pairs. To\ndemonstrate the effectiveness of the proposed dataset, we train popular models\non our dataset and show performance improvement on various downstream tasks,\nnamely, audio-language retrieval, audio captioning, environment classification.\nIn addition, we establish a novel test set and provide a benchmark for\naudio-text tasks. The proposed dataset will be released at\nhttps://auto-acd.github.io/.\n","authors":["Luoyi Sun","Xuenan Xu","Mengyue Wu","Weidi Xie"],"pdf_url":"https://arxiv.org/pdf/2309.11500v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02422v1","updated":"2023-10-03T20:36:03Z","published":"2023-10-03T20:36:03Z","title":"OneAdapt: Fast Adaptation for Deep Learning Applications via\n Backpropagation","summary":" Deep learning inference on streaming media data, such as object detection in\nvideo or LiDAR feeds and text extraction from audio waves, is now ubiquitous.\nTo achieve high inference accuracy, these applications typically require\nsignificant network bandwidth to gather high-fidelity data and extensive GPU\nresources to run deep neural networks (DNNs). While the high demand for network\nbandwidth and GPU resources could be substantially reduced by optimally\nadapting the configuration knobs, such as video resolution and frame rate,\ncurrent adaptation techniques fail to meet three requirements simultaneously:\nadapt configurations (i) with minimum extra GPU or bandwidth overhead; (ii) to\nreach near-optimal decisions based on how the data affects the final DNN's\naccuracy, and (iii) do so for a range of configuration knobs. This paper\npresents OneAdapt, which meets these requirements by leveraging a\ngradient-ascent strategy to adapt configuration knobs. The key idea is to\nembrace DNNs' differentiability to quickly estimate the accuracy's gradient to\neach configuration knob, called AccGrad. Specifically, OneAdapt estimates\nAccGrad by multiplying two gradients: InputGrad (i.e. 
how each configuration\nknob affects the input to the DNN) and DNNGrad (i.e. how the DNN input affects\nthe DNN inference output). We evaluate OneAdapt across five types of\nconfigurations, four analytic tasks, and five types of input data. Compared to\nstate-of-the-art adaptation schemes, OneAdapt cuts bandwidth usage and GPU\nusage by 15-59% while maintaining comparable accuracy or improves accuracy by\n1-5% while using equal or fewer resources.\n","authors":["Kuntai Du","Yuhan Liu","Yitian Hao","Qizheng Zhang","Haodong Wang","Yuyang Huang","Ganesh Ananthanarayanan","Junchen Jiang"],"pdf_url":"https://arxiv.org/pdf/2310.02422v1.pdf","comment":"SoCC' 23"}]},"2023-10-04T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2310.03026v1","updated":"2023-10-04T17:59:49Z","published":"2023-10-04T17:59:49Z","title":"LanguageMPC: Large Language Models as Decision Makers for Autonomous\n Driving","summary":" Existing learning-based autonomous driving (AD) systems face challenges in\ncomprehending high-level information, generalizing to rare events, and\nproviding interpretability. To address these problems, this work employs Large\nLanguage Models (LLMs) as a decision-making component for complex AD scenarios\nthat require human commonsense understanding. We devise cognitive pathways to\nenable comprehensive reasoning with LLMs, and develop algorithms for\ntranslating LLM decisions into actionable driving commands. Through this\napproach, LLM decisions are seamlessly integrated with low-level controllers by\nguided parameter matrix adaptation. Extensive experiments demonstrate that our\nproposed method not only consistently surpasses baseline approaches in\nsingle-vehicle tasks, but also helps handle complex driving behaviors even\nmulti-vehicle coordination, thanks to the commonsense reasoning capabilities of\nLLMs. This paper presents an initial step toward leveraging LLMs as effective\ndecision-makers for intricate AD scenarios in terms of safety, efficiency,\ngeneralizability, and interoperability. We aspire for it to serve as\ninspiration for future research in this field. Project page:\nhttps://sites.google.com/view/llm-mpc\n","authors":["Hao Sha","Yao Mu","Yuxuan Jiang","Li Chen","Chenfeng Xu","Ping Luo","Shengbo Eben Li","Masayoshi Tomizuka","Wei Zhan","Mingyu Ding"],"pdf_url":"https://arxiv.org/pdf/2310.03026v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03025v1","updated":"2023-10-04T17:59:41Z","published":"2023-10-04T17:59:41Z","title":"Retrieval meets Long Context Large Language Models","summary":" Extending the context window of large language models (LLMs) is getting\npopular recently, while the solution of augmenting LLMs with retrieval has\nexisted for years. The natural questions are: i) Retrieval-augmentation versus\nlong context window, which one is better for downstream tasks? ii) Can both\nmethods be combined to get the best of both worlds? In this work, we answer\nthese questions by studying both solutions using two state-of-the-art\npretrained LLMs, i.e., a proprietary 43B GPT and LLaMA2-70B. Perhaps\nsurprisingly, we find that LLM with 4K context window using simple\nretrieval-augmentation at generation can achieve comparable performance to\nfinetuned LLM with 16K context window via positional interpolation on long\ncontext tasks, while taking much less computation. More importantly, we\ndemonstrate that retrieval can significantly improve the performance of LLMs\nregardless of their extended context window sizes. 
Our best model,\nretrieval-augmented LLaMA2-70B with 32K context window, outperforms\nGPT-3.5-turbo-16k and Davinci003 in terms of average score on seven long\ncontext tasks including question answering and query-based summarization. It\nalso outperforms its non-retrieval LLaMA2-70B-32k baseline by a margin, while\nbeing much faster at generation. Our study provides general insights on the\nchoice of retrieval-augmentation versus long context extension of LLM for\npractitioners.\n","authors":["Peng Xu","Wei Ping","Xianchao Wu","Lawrence McAfee","Chen Zhu","Zihan Liu","Sandeep Subramanian","Evelina Bakhturina","Mohammad Shoeybi","Bryan Catanzaro"],"pdf_url":"https://arxiv.org/pdf/2310.03025v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03018v1","updated":"2023-10-04T17:58:11Z","published":"2023-10-04T17:58:11Z","title":"Zero Resource Code-switched Speech Benchmark Using Speech Utterance\n Pairs For Multiple Spoken Languages","summary":" We introduce a new zero resource code-switched speech benchmark designed to\ndirectly assess the code-switching capabilities of self-supervised speech\nencoders. We showcase a baseline system of language modeling on discrete units\nto demonstrate how the code-switching abilities of speech encoders can be\nassessed in a zero-resource manner. Our experiments encompass a variety of\nwell-known speech encoders, including Wav2vec 2.0, HuBERT, XLSR, etc. We\nexamine the impact of pre-training languages and model size on benchmark\nperformance. Notably, though our results demonstrate that speech encoders with\nmultilingual pre-training, exemplified by XLSR, outperform monolingual variants\n(Wav2vec 2.0, HuBERT) in code-switching scenarios, there is still substantial\nroom for improvement in their code-switching linguistic abilities.\n","authors":["Kuan-Po Huang","Chih-Kai Yang","Yu-Kuan Fu","Ewan Dunbar","Hung-yi Lee"],"pdf_url":"https://arxiv.org/pdf/2310.03018v1.pdf","comment":"Submitted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2310.03017v1","updated":"2023-10-04T17:58:05Z","published":"2023-10-04T17:58:05Z","title":"Multimodal Question Answering for Unified Information Extraction","summary":" Multimodal information extraction (MIE) aims to extract structured\ninformation from unstructured multimedia content. Due to the diversity of tasks\nand settings, most current MIE models are task-specific and data-intensive,\nwhich limits their generalization to real-world scenarios with diverse task\nrequirements and limited labeled data. To address these issues, we propose a\nnovel multimodal question answering (MQA) framework to unify three MIE tasks by\nreformulating them into a unified span extraction and multi-choice QA pipeline.\nExtensive experiments on six datasets show that: 1) Our MQA framework\nconsistently and significantly improves the performances of various\noff-the-shelf large multimodal models (LMM) on MIE tasks, compared to vanilla\nprompting. 2) In the zero-shot setting, MQA outperforms previous\nstate-of-the-art baselines by a large margin. In addition, the effectiveness of\nour framework can successfully transfer to the few-shot setting, enhancing LMMs\non a scale of 10B parameters to be competitive or outperform much larger\nlanguage models such as ChatGPT and GPT-4. 
Our MQA framework can serve as a\ngeneral principle of utilizing LMMs to better solve MIE and potentially other\ndownstream multimodal tasks.\n","authors":["Yuxuan Sun","Kai Zhang","Yu Su"],"pdf_url":"https://arxiv.org/pdf/2310.03017v1.pdf","comment":"24 pages, 2 figures"},{"id":"http://arxiv.org/abs/2310.03016v1","updated":"2023-10-04T17:57:33Z","published":"2023-10-04T17:57:33Z","title":"Understanding In-Context Learning in Transformers and LLMs by Learning\n to Learn Discrete Functions","summary":" In order to understand the in-context learning phenomenon, recent works have\nadopted a stylized experimental framework and demonstrated that Transformers\ncan learn gradient-based learning algorithms for various classes of real-valued\nfunctions. However, the limitations of Transformers in implementing learning\nalgorithms, and their ability to learn other forms of algorithms are not well\nunderstood. Additionally, the degree to which these capabilities are confined\nto attention-based models is unclear. Furthermore, it remains to be seen\nwhether the insights derived from these stylized settings can be extrapolated\nto pretrained Large Language Models (LLMs). In this work, we take a step\ntowards answering these questions by demonstrating the following: (a) On a\ntest-bed with a variety of Boolean function classes, we find that Transformers\ncan nearly match the optimal learning algorithm for 'simpler' tasks, while\ntheir performance deteriorates on more 'complex' tasks. Additionally, we find\nthat certain attention-free models perform (almost) identically to Transformers\non a range of tasks. (b) When provided a teaching sequence, i.e. a set of\nexamples that uniquely identifies a function in a class, we show that\nTransformers learn more sample-efficiently. Interestingly, our results show\nthat Transformers can learn to implement two distinct algorithms to solve a\nsingle task, and can adaptively select the more sample-efficient algorithm\ndepending on the sequence of in-context examples. (c) Lastly, we show that\nextant LLMs, e.g. LLaMA-2, GPT-4, can compete with nearest-neighbor baselines\non prediction tasks that are guaranteed to not be in their training set.\n","authors":["Satwik Bhattamishra","Arkil Patel","Phil Blunsom","Varun Kanade"],"pdf_url":"https://arxiv.org/pdf/2310.03016v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2106.06082v2","updated":"2023-10-04T17:54:58Z","published":"2021-06-10T23:24:26Z","title":"One Sense per Translation","summary":" Word sense disambiguation (WSD) is the task of determining the sense of a\nword in context. Translations have been used in WSD as a source of knowledge,\nand even as a means of delimiting word senses. In this paper, we define three\ntheoretical properties of the relationship between senses and translations, and\nargue that they constitute necessary conditions for using translations as sense\ninventories. The key property of One Sense per Translation (OSPT) provides a\nfoundation for a translation-based WSD method. The results of an intrinsic\nevaluation experiment indicate that our method achieves a precision of\napproximately 93% compared to manual corpus annotations. 
Our extrinsic\nevaluation experiments demonstrate WSD improvements of up to 4.6% F1-score on\ndifficult WSD datasets.\n","authors":["Bradley Hauer","Grzegorz Kondrak"],"pdf_url":"https://arxiv.org/pdf/2106.06082v2.pdf","comment":"To be published at IJCNLP-AACL 2023: The 13th International Joint\n Conference on Natural Language Processing and the 3rd Conference of the\n Asia-Pacific Chapter of the Association for Computational Linguistics"},{"id":"http://arxiv.org/abs/2310.01469v2","updated":"2023-10-04T17:53:49Z","published":"2023-10-02T17:01:56Z","title":"LLM Lies: Hallucinations are not Bugs, but Features as Adversarial\n Examples","summary":" Large Language Models (LLMs), including GPT-3.5, LLaMA, and PaLM, seem to be\nknowledgeable and able to adapt to many tasks. However, we still can not\ncompletely trust their answer, since LLMs suffer from\nhallucination--fabricating non-existent facts to cheat users without\nperception. And the reasons for their existence and pervasiveness remain\nunclear. In this paper, we demonstrate that non-sense prompts composed of\nrandom tokens can also elicit the LLMs to respond with hallucinations. This\nphenomenon forces us to revisit that hallucination may be another view of\nadversarial examples, and it shares similar features with conventional\nadversarial examples as the basic feature of LLMs. Therefore, we formalize an\nautomatic hallucination triggering method as the hallucination attack in an\nadversarial way. Finally, we explore basic feature of attacked adversarial\nprompts and propose a simple yet effective defense strategy. Our code is\nreleased on GitHub.\n","authors":["Jia-Yu Yao","Kun-Peng Ning","Zhen-Hui Liu","Mu-Nan Ning","Li Yuan"],"pdf_url":"https://arxiv.org/pdf/2310.01469v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03003v1","updated":"2023-10-04T17:41:59Z","published":"2023-10-04T17:41:59Z","title":"From Words to Watts: Benchmarking the Energy Costs of Large Language\n Model Inference","summary":" Large language models (LLMs) have exploded in popularity due to their new\ngenerative capabilities that go far beyond prior state-of-the-art. These\ntechnologies are increasingly being leveraged in various domains such as law,\nfinance, and medicine. However, these models carry significant computational\nchallenges, especially the compute and energy costs required for inference.\nInference energy costs already receive less attention than the energy costs of\ntraining LLMs -- despite how often these large models are called on to conduct\ninference in reality (e.g., ChatGPT). As these state-of-the-art LLMs see\nincreasing usage and deployment in various domains, a better understanding of\ntheir resource utilization is crucial for cost-savings, scaling performance,\nefficient hardware usage, and optimal inference strategies.\n In this paper, we describe experiments conducted to study the computational\nand energy utilization of inference with LLMs. We benchmark and conduct a\npreliminary analysis of the inference performance and inference energy costs of\ndifferent sizes of LLaMA -- a recent state-of-the-art LLM -- developed by Meta\nAI on two generations of popular GPUs (NVIDIA V100 \\& A100) and two datasets\n(Alpaca and GSM8K) to reflect the diverse set of tasks/benchmarks for LLMs in\nresearch and practice. We present the results of multi-node, multi-GPU\ninference using model sharding across up to 32 GPUs. 
To our knowledge, our work\nis the one of the first to study LLM inference performance from the perspective\nof computational and energy resources at this scale.\n","authors":["Siddharth Samsi","Dan Zhao","Joseph McDonald","Baolin Li","Adam Michaleas","Michael Jones","William Bergeron","Jeremy Kepner","Devesh Tiwari","Vijay Gadepally"],"pdf_url":"https://arxiv.org/pdf/2310.03003v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02998v1","updated":"2023-10-04T17:34:00Z","published":"2023-10-04T17:34:00Z","title":"ECoFLaP: Efficient Coarse-to-Fine Layer-Wise Pruning for Vision-Language\n Models","summary":" Large Vision-Language Models (LVLMs) can understand the world comprehensively\nby integrating rich information from different modalities, achieving remarkable\nperformance improvements on various multimodal downstream tasks. However,\ndeploying LVLMs is often problematic due to their massive computational/energy\ncosts and carbon consumption. Such issues make it infeasible to adopt\nconventional iterative global pruning, which is costly due to computing the\nHessian matrix of the entire large model for sparsification. Alternatively,\nseveral studies have recently proposed layer-wise pruning approaches to avoid\nthe expensive computation of global pruning and efficiently compress model\nweights according to their importance within a layer. However, these methods\noften suffer from suboptimal model compression due to their lack of a global\nperspective. To address this limitation in recent efficient pruning methods for\nlarge models, we propose Efficient Coarse-to-Fine Layer-Wise Pruning (ECoFLaP),\na two-stage coarse-to-fine weight pruning approach for LVLMs. We first\ndetermine the sparsity ratios of different layers or blocks by leveraging the\nglobal importance score, which is efficiently computed based on the\nzeroth-order approximation of the global model gradients. Then, the multimodal\nmodel performs local layer-wise unstructured weight pruning based on\nglobally-informed sparsity ratios. We validate our proposed method across\nvarious multimodal and unimodal models and datasets, demonstrating significant\nperformance improvements over prevalent pruning techniques in the high-sparsity\nregime.\n","authors":["Yi-Lin Sung","Jaehong Yoon","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2310.02998v1.pdf","comment":"Project page: https://ecoflap.github.io/"},{"id":"http://arxiv.org/abs/2310.02992v1","updated":"2023-10-04T17:28:44Z","published":"2023-10-04T17:28:44Z","title":"Kosmos-G: Generating Images in Context with Multimodal Large Language\n Models","summary":" Recent advancements in text-to-image (T2I) and vision-language-to-image\n(VL2I) generation have made significant strides. However, the generation from\ngeneralized vision-language inputs, especially involving multiple images,\nremains under-explored. This paper presents Kosmos-G, a model that leverages\nthe advanced perception capabilities of Multimodal Large Language Models\n(MLLMs) to tackle the aforementioned challenge. Our approach aligns the output\nspace of MLLM with CLIP using the textual modality as an anchor and performs\ncompositional instruction tuning on curated data. Kosmos-G demonstrates a\nunique capability of zero-shot multi-entity subject-driven generation. Notably,\nthe score distillation instruction tuning requires no modifications to the\nimage decoder. 
This allows for a seamless substitution of CLIP and effortless\nintegration with a myriad of U-Net techniques ranging from fine-grained\ncontrols to personalized image decoder variants. We posit Kosmos-G as an\ninitial attempt towards the goal of \"image as a foreign language in image\ngeneration.\"\n","authors":["Xichen Pan","Li Dong","Shaohan Huang","Zhiliang Peng","Wenhu Chen","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2310.02992v1.pdf","comment":"Code: https://aka.ms/Kosmos-G Project Page:\n https://xichenpan.github.io/kosmosg"},{"id":"http://arxiv.org/abs/2310.02989v1","updated":"2023-10-04T17:26:16Z","published":"2023-10-04T17:26:16Z","title":"xVal: A Continuous Number Encoding for Large Language Models","summary":" Large Language Models have not yet been broadly adapted for the analysis of\nscientific datasets due in part to the unique difficulties of tokenizing\nnumbers. We propose xVal, a numerical encoding scheme that represents any real\nnumber using just a single token. xVal represents a given real number by\nscaling a dedicated embedding vector by the number value. Combined with a\nmodified number-inference approach, this strategy renders the model end-to-end\ncontinuous when considered as a map from the numbers of the input string to\nthose of the output string. This leads to an inductive bias that is generally\nmore suitable for applications in scientific domains. We empirically evaluate\nour proposal on a number of synthetic and real-world datasets. Compared with\nexisting number encoding schemes, we find that xVal is more token-efficient and\ndemonstrates improved generalization.\n","authors":["Siavash Golkar","Mariel Pettee","Michael Eickenberg","Alberto Bietti","Miles Cranmer","Geraud Krawezik","Francois Lanusse","Michael McCabe","Ruben Ohana","Liam Parker","Bruno Régaldo-Saint Blancard","Tiberiu Tesileanu","Kyunghyun Cho","Shirley Ho"],"pdf_url":"https://arxiv.org/pdf/2310.02989v1.pdf","comment":"10 pages 7 figures. Supplementary: 5 pages 2 figures"},{"id":"http://arxiv.org/abs/2310.02984v1","updated":"2023-10-04T17:20:34Z","published":"2023-10-04T17:20:34Z","title":"Scaling Laws for Associative Memories","summary":" Learning arguably involves the discovery and memorization of abstract rules.\nThe aim of this paper is to study associative memory mechanisms. Our model is\nbased on high-dimensional matrices consisting of outer products of embeddings,\nwhich relates to the inner layers of transformer language models. We derive\nprecise scaling laws with respect to sample size and parameter size, and\ndiscuss the statistical efficiency of different estimators, including\noptimization-based algorithms. We provide extensive numerical experiments to\nvalidate and interpret theoretical results, including fine-grained\nvisualizations of the stored memory associations.\n","authors":["Vivien Cabannes","Elvis Dohmatob","Alberto Bietti"],"pdf_url":"https://arxiv.org/pdf/2310.02984v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02980v1","updated":"2023-10-04T17:17:06Z","published":"2023-10-04T17:17:06Z","title":"Never Train from Scratch: Fair Comparison of Long-Sequence Models\n Requires Data-Driven Priors","summary":" Modeling long-range dependencies across sequences is a longstanding goal in\nmachine learning and has led to architectures, such as state space models, that\ndramatically outperform Transformers on long sequences. However, these\nimpressive empirical gains have been by and large demonstrated on benchmarks\n(e.g. 
Long Range Arena), where models are randomly initialized and trained to\npredict a target label from an input sequence. In this work, we show that\nrandom initialization leads to gross overestimation of the differences between\narchitectures and that pretraining with standard denoising objectives, using\n$\\textit{only the downstream task data}$, leads to dramatic gains across\nmultiple architectures and to very small gaps between Transformers and state\nspace models (SSMs). In stark contrast to prior works, we find vanilla\nTransformers to match the performance of S4 on Long Range Arena when properly\npretrained, and we improve the best reported results of SSMs on the PathX-256\ntask by 20 absolute points. Subsequently, we analyze the utility of\npreviously-proposed structured parameterizations for SSMs and show they become\nmostly redundant in the presence of data-driven initialization obtained through\npretraining. Our work shows that, when evaluating different architectures on\nsupervised tasks, incorporation of data-driven priors via pretraining is\nessential for reliable performance estimation, and can be done efficiently.\n","authors":["Ido Amos","Jonathan Berant","Ankit Gupta"],"pdf_url":"https://arxiv.org/pdf/2310.02980v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02977v1","updated":"2023-10-04T17:12:18Z","published":"2023-10-04T17:12:18Z","title":"T$^3$Bench: Benchmarking Current Progress in Text-to-3D Generation","summary":" Recent methods in text-to-3D leverage powerful pretrained diffusion models to\noptimize NeRF. Notably, these methods are able to produce high-quality 3D\nscenes without training on 3D data. Due to the open-ended nature of the task,\nmost studies evaluate their results with subjective case studies and user\nexperiments, thereby presenting a challenge in quantitatively addressing the\nquestion: How has current progress in Text-to-3D gone so far? In this paper, we\nintroduce T$^3$Bench, the first comprehensive text-to-3D benchmark containing\ndiverse text prompts of three increasing complexity levels that are specially\ndesigned for 3D generation. To assess both the subjective quality and the text\nalignment, we propose two automatic metrics based on multi-view images produced\nby the 3D contents. The quality metric combines multi-view text-image scores\nand regional convolution to detect quality and view inconsistency. The\nalignment metric uses multi-view captioning and Large Language Model (LLM)\nevaluation to measure text-3D consistency. Both metrics closely correlate with\ndifferent dimensions of human judgments, providing a paradigm for efficiently\nevaluating text-to-3D models. The benchmarking results, shown in Fig. 1, reveal\nperformance differences among six prevalent text-to-3D methods. Our analysis\nfurther highlights the common struggles for current methods on generating\nsurroundings and multi-object scenes, as well as the bottleneck of leveraging\n2D guidance for 3D generation. 
Our project page is available at:\nhttps://t3bench.com.\n","authors":["Yuze He","Yushi Bai","Matthieu Lin","Wang Zhao","Yubin Hu","Jenny Sheng","Ran Yi","Juanzi Li","Yong-Jin Liu"],"pdf_url":"https://arxiv.org/pdf/2310.02977v1.pdf","comment":"16 pages, 11 figures"},{"id":"http://arxiv.org/abs/2310.02973v1","updated":"2023-10-04T17:10:23Z","published":"2023-10-04T17:10:23Z","title":"UniverSLU: Universal Spoken Language Understanding for Diverse\n Classification and Sequence Generation Tasks with a Single Network","summary":" Recent studies have demonstrated promising outcomes by employing large\nlanguage models with multi-tasking capabilities. They utilize prompts to guide\nthe model's behavior and surpass performance of task-specific models. Motivated\nby this, we ask: can we build a single model that jointly perform various\nspoken language understanding (SLU) tasks? To address this, we utilize\npre-trained automatic speech recognition (ASR) models and employ various task\nand dataset specifiers as discrete prompts. We demonstrate efficacy of our\nsingle multi-task learning (MTL) model \"UniverSLU\" for 12 different speech\nclassification and sequence generation tasks across 17 datasets and 9\nlanguages. Results show that UniverSLU achieves competitive performance and\neven surpasses task-specific models. We also conduct preliminary investigations\ninto enabling human-interpretable natural phrases instead of task specifiers as\ndiscrete prompts and test the model's generalization capabilities to new\nparaphrases.\n","authors":["Siddhant Arora","Hayato Futami","Jee-weon Jung","Yifan Peng","Roshan Sharma","Yosuke Kashiwagi","Emiru Tsunoo","Shinji Watanabe"],"pdf_url":"https://arxiv.org/pdf/2310.02973v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02971v1","updated":"2023-10-04T17:07:32Z","published":"2023-10-04T17:07:32Z","title":"Prompting and Adapter Tuning for Self-supervised Encoder-Decoder Speech\n Model","summary":" Prompting and adapter tuning have emerged as efficient alternatives to\nfine-tuning (FT) methods. However, existing studies on speech prompting focused\non classification tasks and failed on more complex sequence generation tasks.\nBesides, adapter tuning is primarily applied with a focus on encoder-only\nself-supervised models. Our experiments show that prompting on Wav2Seq, a\nself-supervised encoder-decoder model, surpasses previous works in sequence\ngeneration tasks. It achieves a remarkable 53% relative improvement in word\nerror rate for ASR and a 27% in F1 score for slot filling. Additionally,\nprompting competes with the FT method in the low-resource scenario. Moreover,\nwe show the transferability of prompting and adapter tuning on Wav2Seq in\ncross-lingual ASR. 
When limited trainable parameters are involved, prompting\nand adapter tuning consistently outperform conventional FT across 7 languages.\nNotably, in the low-resource scenario, prompting consistently outperforms\nadapter tuning.\n","authors":["Kai-Wei Chang","Ming-Hsin Chen","Yun-Ping Lin","Jing Neng Hsu","Paul Kuo-Ming Huang","Chien-yu Huang","Shang-Wen Li","Hung-yi Lee"],"pdf_url":"https://arxiv.org/pdf/2310.02971v1.pdf","comment":"Accepted to IEEE ASRU 2023"},{"id":"http://arxiv.org/abs/2309.14509v2","updated":"2023-10-04T16:51:13Z","published":"2023-09-25T20:15:57Z","title":"DeepSpeed Ulysses: System Optimizations for Enabling Training of Extreme\n Long Sequence Transformer Models","summary":" Computation in a typical Transformer-based large language model (LLM) can be\ncharacterized by batch size, hidden dimension, number of layers, and sequence\nlength. Until now, system works for accelerating LLM training have focused on\nthe first three dimensions: data parallelism for batch size, tensor parallelism\nfor hidden size and pipeline parallelism for model depth or layers. These\nwidely studied forms of parallelism are not targeted or optimized for long\nsequence Transformer models. Given practical application needs for long\nsequence LLM, renewed attentions are being drawn to sequence parallelism.\nHowever, existing works in sequence parallelism are constrained by\nmemory-communication inefficiency, limiting their scalability to long sequence\nlarge models. In this work, we introduce DeepSpeed-Ulysses, a novel, portable\nand effective methodology for enabling highly efficient and scalable LLM\ntraining with extremely long sequence length. DeepSpeed-Ulysses at its core\npartitions input data along the sequence dimension and employs an efficient\nall-to-all collective communication for attention computation. Theoretical\ncommunication analysis shows that whereas other methods incur communication\noverhead as sequence length increases, DeepSpeed-Ulysses maintains constant\ncommunication volume when sequence length and compute devices are increased\nproportionally. Furthermore, experimental evaluations show that\nDeepSpeed-Ulysses trains 2.5x faster with 4x longer sequence length than the\nexisting method SOTA baseline.\n","authors":["Sam Ade Jacobs","Masahiro Tanaka","Chengming Zhang","Minjia Zhang","Shuaiwen Leon Song","Samyam Rajbhandari","Yuxiong He"],"pdf_url":"https://arxiv.org/pdf/2309.14509v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02954v1","updated":"2023-10-04T16:44:37Z","published":"2023-10-04T16:44:37Z","title":"DQ-LoRe: Dual Queries with Low Rank Approximation Re-ranking for\n In-Context Learning","summary":" Recent advances in natural language processing, primarily propelled by Large\nLanguage Models (LLMs), have showcased their remarkable capabilities grounded\nin in-context learning. A promising avenue for guiding LLMs in intricate\nreasoning tasks involves the utilization of intermediate reasoning steps within\nthe Chain-of-Thought (CoT) paradigm. Nevertheless, the central challenge lies\nin the effective selection of exemplars for facilitating in-context learning.\nIn this study, we introduce a framework that leverages Dual Queries and\nLow-rank approximation Re-ranking (DQ-LoRe) to automatically select exemplars\nfor in-context learning. Dual Queries first query LLM to obtain LLM-generated\nknowledge such as CoT, then query the retriever to obtain the final exemplars\nvia both question and the knowledge. 
Moreover, for the second query, LoRe\nemploys dimensionality reduction techniques to refine exemplar selection,\nensuring close alignment with the input question's knowledge. Through extensive\nexperiments, we demonstrate that DQ-LoRe significantly outperforms prior\nstate-of-the-art methods in the automatic selection of exemplars for GPT-4,\nenhancing performance from 92.5\\% to 94.2\\%. Our comprehensive analysis further\nreveals that DQ-LoRe consistently outperforms retrieval-based approaches in\nterms of both performance and adaptability, especially in scenarios\ncharacterized by distribution shifts. DQ-LoRe pushes the boundaries of\nin-context learning and opens up new avenues for addressing complex reasoning\nchallenges. We will release the code soon.\n","authors":["Jiong Xiong","Zixuan Li","Chuanyang Zheng","Zhijiang Guo","Yichun Yin","Enze Xie","Zhicheng Yang","Qingxing Cao","Haiming Wang","Xiongwei Han","Jing Tang","Chengming Li","Xiaodan Liang"],"pdf_url":"https://arxiv.org/pdf/2310.02954v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02953v1","updated":"2023-10-04T16:44:23Z","published":"2023-10-04T16:44:23Z","title":"JsonTuning: Towards Generalizable, Robust, and Controllable Instruction\n Tuning","summary":" Instruction tuning has emerged as a crucial process for harnessing the\ncapabilities of large language models (LLMs) by providing explicit task\ninstructions, leading to improved performance in various tasks. However,\nprevalent text-to-text instruction tuning (TextTuning) methods suffer from\nlimitations in generalization, robustness, and controllability due to the\nambiguity and lack of explicit structure in tasks. In this paper, we propose\nJsonTuning, a novel structure-to-structure approach for instruction tuning. By\nleveraging the versatility and structured nature of JSON to represent tasks,\nJsonTuning enhances generalization by helping the model understand essential\ntask elements and their relations, improves robustness by minimizing ambiguity,\nand increases controllability by providing explicit control over the output. We\nconduct a comprehensive comparative study with diverse language models and\nevaluation benchmarks. Experimental results show that JsonTuning outperforms\nTextTuning in various applications, showcasing improved performance,\nadaptability, robustness, and controllability. By overcoming the limitations of\nTextTuning, JsonTuning demonstrates significant potential for more effective\nand reliable LLMs capable of handling diverse scenarios.\n","authors":["Chang Gao","Wenxuan Zhang","Guizhen Chen","Wai Lam"],"pdf_url":"https://arxiv.org/pdf/2310.02953v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02949v1","updated":"2023-10-04T16:39:31Z","published":"2023-10-04T16:39:31Z","title":"Shadow Alignment: The Ease of Subverting Safely-Aligned Language Models","summary":" Warning: This paper contains examples of harmful language, and reader\ndiscretion is recommended. The increasing open release of powerful large\nlanguage models (LLMs) has facilitated the development of downstream\napplications by reducing the essential cost of data annotation and computation.\nTo ensure AI safety, extensive safety-alignment measures have been conducted to\narmor these models against malicious use (primarily hard prompt attack).\nHowever, beneath the seemingly resilient facade of the armor, there might lurk\na shadow. 
By simply tuning on 100 malicious examples with 1 GPU hour, these\nsafely aligned LLMs can be easily subverted to generate harmful content.\nFormally, we term a new attack as Shadow Alignment: utilizing a tiny amount of\ndata can elicit safely-aligned models to adapt to harmful tasks without\nsacrificing model helpfulness. Remarkably, the subverted models retain their\ncapability to respond appropriately to regular inquiries. Experiments across 8\nmodels released by 5 different organizations (LLaMa-2, Falcon, InternLM,\nBaiChuan2, Vicuna) demonstrate the effectiveness of shadow alignment attack.\nBesides, the single-turn English-only attack successfully transfers to\nmulti-turn dialogue and other languages. This study serves as a clarion call\nfor a collective effort to overhaul and fortify the safety of open-source LLMs\nagainst malicious attackers.\n","authors":["Xianjun Yang","Xiao Wang","Qi Zhang","Linda Petzold","William Yang Wang","Xun Zhao","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2310.02949v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2305.17359v2","updated":"2023-10-04T16:36:09Z","published":"2023-05-27T03:58:29Z","title":"DNA-GPT: Divergent N-Gram Analysis for Training-Free Detection of\n GPT-Generated Text","summary":" Large language models (LLMs) have notably enhanced the fluency and diversity\nof machine-generated text. However, this progress also presents a significant\nchallenge in detecting the origin of a given text, and current research on\ndetection methods lags behind the rapid evolution of LLMs. Conventional\ntraining-based methods have limitations in flexibility, particularly when\nadapting to new domains, and they often lack explanatory power. To address this\ngap, we propose a novel training-free detection strategy called Divergent\nN-Gram Analysis (DNA-GPT). Given a text, we first truncate it in the middle and\nthen use only the preceding portion as input to the LLMs to regenerate the new\nremaining parts. By analyzing the differences between the original and new\nremaining parts through N-gram analysis in black-box or probability divergence\nin white-box, we unveil significant discrepancies between the distribution of\nmachine-generated text and the distribution of human-written text. We conducted\nextensive experiments on the most advanced LLMs from OpenAI, including\ntext-davinci-003, GPT-3.5-turbo, and GPT-4, as well as open-source models such\nas GPT-NeoX-20B and LLaMa-13B. Results show that our zero-shot approach\nexhibits state-of-the-art performance in distinguishing between human and\nGPT-generated text on four English and one German dataset, outperforming\nOpenAI's own classifier, which is trained on millions of text. Additionally,\nour methods provide reasonable explanations and evidence to support our claim,\nwhich is a unique feature of explainable detection. Our method is also robust\nunder the revised text attack and can additionally solve model sourcing. 
Codes\nare available at https://github.com/Xianjun-Yang/DNA-GPT.\n","authors":["Xianjun Yang","Wei Cheng","Yue Wu","Linda Petzold","William Yang Wang","Haifeng Chen"],"pdf_url":"https://arxiv.org/pdf/2305.17359v2.pdf","comment":"Updates"},{"id":"http://arxiv.org/abs/2310.02943v1","updated":"2023-10-04T16:23:37Z","published":"2023-10-04T16:23:37Z","title":"LibriSpeech-PC: Benchmark for Evaluation of Punctuation and\n Capitalization Capabilities of end-to-end ASR Models","summary":" Traditional automatic speech recognition (ASR) models output lower-cased\nwords without punctuation marks, which reduces readability and necessitates a\nsubsequent text processing model to convert ASR transcripts into a proper\nformat. Simultaneously, the development of end-to-end ASR models capable of\npredicting punctuation and capitalization presents several challenges,\nprimarily due to limited data availability and shortcomings in the existing\nevaluation methods, such as inadequate assessment of punctuation prediction. In\nthis paper, we introduce a LibriSpeech-PC benchmark designed to assess the\npunctuation and capitalization prediction capabilities of end-to-end ASR\nmodels. The benchmark includes a LibriSpeech-PC dataset with restored\npunctuation and capitalization, a novel evaluation metric called Punctuation\nError Rate (PER) that focuses on punctuation marks, and initial baseline\nmodels. All code, data, and models are publicly available.\n","authors":["Aleksandr Meister","Matvei Novikov","Nikolay Karpov","Evelina Bakhturina","Vitaly Lavrukhin","Boris Ginsburg"],"pdf_url":"https://arxiv.org/pdf/2310.02943v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02932v1","updated":"2023-10-04T16:09:48Z","published":"2023-10-04T16:09:48Z","title":"Assessing Large Language Models on Climate Information","summary":" Understanding how climate change affects us and learning about available\nsolutions are key steps toward empowering individuals and communities to\nmitigate and adapt to it. As Large Language Models (LLMs) rise in popularity,\nit is necessary to assess their capability in this domain. In this study, we\npresent a comprehensive evaluation framework, grounded in science communication\nprinciples, to analyze LLM responses to climate change topics. Our framework\nemphasizes both the presentational and epistemological adequacy of answers,\noffering a fine-grained analysis of LLM generations. Spanning 8 dimensions, our\nframework discerns up to 30 distinct issues in model outputs. The task is a\nreal-world example of a growing number of challenging problems where AI can\ncomplement and lift human performance. We introduce a novel and practical\nprotocol for scalable oversight that uses AI Assistance and relies on raters\nwith relevant educational backgrounds. We evaluate several recent LLMs and\nconduct a comprehensive analysis of the results, shedding light on both the\npotential and the limitations of LLMs in the realm of climate communication.\n","authors":["Jannis Bulian","Mike S. 
Schäfer","Afra Amini","Heidi Lam","Massimiliano Ciaramita","Ben Gaiarin","Michelle Chen Huebscher","Christian Buck","Niels Mede","Markus Leippold","Nadine Strauss"],"pdf_url":"https://arxiv.org/pdf/2310.02932v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02863v4","updated":"2023-10-04T15:34:00Z","published":"2023-07-06T09:03:10Z","title":"ValiTex -- a unified validation framework for computational text-based\n measures of social science constructs","summary":" Guidance on how to validate computational text-based measures of social\nscience constructs is fragmented. While scholars generally acknowledge the\nimportance of validating their text-based measures, they often lack common\nterminology and a unified framework to do so. This paper introduces ValiTex, a\nnew validation framework designed to assist scholars in validly measuring\nsocial science constructs based on textual data. ValiTex prescribes researchers\nto demonstrate three types of validity evidence: substantive evidence\n(outlining the theoretical underpinning of the measure), structural evidence\n(examining the properties of the text model and its output), and external\nevidence (testing for how the measure relates to independent information). In\naddition to the framework, ValiTex offers valuable practical guidance through a\nchecklist that is adaptable for different use cases. The checklist clearly\ndefines and outlines specific validation steps while also offering a\nknowledgeable evaluation of the importance of each validation step to establish\nvalidity. We demonstrate the utility of the framework by applying it to a use\ncase of detecting sexism from social media data.\n","authors":["Lukas Birkenmaier","Claudia Wagner","Clemens Lechner"],"pdf_url":"https://arxiv.org/pdf/2307.02863v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02876v1","updated":"2023-10-04T15:10:06Z","published":"2023-10-04T15:10:06Z","title":"Hate Speech Detection in Limited Data Contexts using Synthetic Data\n Generation","summary":" A growing body of work has focused on text classification methods for\ndetecting the increasing amount of hate speech posted online. This progress has\nbeen limited to only a select number of highly-resourced languages causing\ndetection systems to either under-perform or not exist in limited data\ncontexts. This is majorly caused by a lack of training data which is expensive\nto collect and curate in these settings. In this work, we propose a data\naugmentation approach that addresses the problem of lack of data for online\nhate speech detection in limited data contexts using synthetic data generation\ntechniques. Given a handful of hate speech examples in a high-resource language\nsuch as English, we present three methods to synthesize new examples of hate\nspeech data in a target language that retains the hate sentiment in the\noriginal examples but transfers the hate targets. We apply our approach to\ngenerate training data for hate speech classification tasks in Hindi and\nVietnamese. Our findings show that a model trained on synthetic data performs\ncomparably to, and in some cases outperforms, a model trained only on the\nsamples available in the target domain. This method can be adopted to bootstrap\nhate speech detection models from scratch in limited data contexts. 
As the\ngrowth of social media within these contexts continues to outstrip response\nefforts, this work furthers our capacities for detection, understanding, and\nresponse to hate speech.\n","authors":["Aman Khullar","Daniel Nkemelu","Cuong V. Nguyen","Michael L. Best"],"pdf_url":"https://arxiv.org/pdf/2310.02876v1.pdf","comment":"Accepted at ACM Journal on Computing and Sustainable Societies"},{"id":"http://arxiv.org/abs/2308.16175v2","updated":"2023-10-04T15:05:24Z","published":"2023-08-30T17:53:25Z","title":"Quantifying Uncertainty in Answers from any Language Model and Enhancing\n their Trustworthiness","summary":" We introduce BSDetector, a method for detecting bad and speculative answers\nfrom a pretrained Large Language Model by estimating a numeric confidence score\nfor any output it generated. Our uncertainty quantification technique works for\nany LLM accessible only via a black-box API, whose training data remains\nunknown. By expending a bit of extra computation, users of any LLM API can now\nget the same response as they would ordinarily, as well as a confidence\nestimate that cautions when not to trust this response. Experiments on both\nclosed and open-form Question-Answer benchmarks reveal that BSDetector more\naccurately identifies incorrect LLM responses than alternative uncertainty\nestimation procedures (for both GPT-3 and ChatGPT). By sampling multiple\nresponses from the LLM and considering the one with the highest confidence\nscore, we can additionally obtain more accurate responses from the same LLM,\nwithout any extra training steps. In applications involving automated\nevaluation with LLMs, accounting for our confidence scores leads to more\nreliable evaluation in both human-in-the-loop and fully-automated settings\n(across both GPT 3.5 and 4).\n","authors":["Jiuhai Chen","Jonas Mueller"],"pdf_url":"https://arxiv.org/pdf/2308.16175v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10792v3","updated":"2023-10-04T15:00:38Z","published":"2023-08-21T15:35:16Z","title":"Instruction Tuning for Large Language Models: A Survey","summary":" This paper surveys research works in the quickly advancing field of\ninstruction tuning (IT), a crucial technique to enhance the capabilities and\ncontrollability of large language models (LLMs). Instruction tuning refers to\nthe process of further training LLMs on a dataset consisting of\n\\textsc{(instruction, output)} pairs in a supervised fashion, which bridges the\ngap between the next-word prediction objective of LLMs and the users' objective\nof having LLMs adhere to human instructions. In this work, we make a systematic\nreview of the literature, including the general methodology of IT, the\nconstruction of IT datasets, the training of IT models, and applications to\ndifferent modalities, domains and applications, along with an analysis on\naspects that influence the outcome of IT (e.g., generation of instruction\noutputs, size of the instruction dataset, etc). We also review the potential\npitfalls of IT along with criticism against it, along with efforts pointing out\ncurrent deficiencies of existing strategies and suggest some avenues for\nfruitful research. 
Project page: github.com/xiaoya-li/Instruction-Tuning-Survey\n","authors":["Shengyu Zhang","Linfeng Dong","Xiaoya Li","Sen Zhang","Xiaofei Sun","Shuhe Wang","Jiwei Li","Runyi Hu","Tianwei Zhang","Fei Wu","Guoyin Wang"],"pdf_url":"https://arxiv.org/pdf/2308.10792v3.pdf","comment":"A Survey paper, Pre-print"},{"id":"http://arxiv.org/abs/2310.02842v1","updated":"2023-10-04T14:11:12Z","published":"2023-10-04T14:11:12Z","title":"Sweeping Heterogeneity with Smart MoPs: Mixture of Prompts for LLM Task\n Adaptation","summary":" Large Language Models (LLMs) have the ability to solve a variety of tasks,\nsuch as text summarization and mathematical questions, just out of the box, but\nthey are often trained with a single task in mind. Due to high computational\ncosts, the current trend is to use prompt instruction tuning to better adjust\nmonolithic, pretrained LLMs for new -- but often individual -- downstream\ntasks. Thus, how one would expand prompt tuning to handle -- concomitantly --\nheterogeneous tasks and data distributions is a widely open question. To\naddress this gap, we suggest the use of \\emph{Mixture of Prompts}, or MoPs,\nassociated with smart gating functionality: the latter -- whose design is one\nof the contributions of this paper -- can identify relevant skills embedded in\ndifferent groups of prompts and dynamically assign combined experts (i.e.,\ncollection of prompts), based on the target task. Additionally, MoPs are\nempirically agnostic to any model compression technique applied -- for\nefficiency reasons -- as well as instruction data source and task composition.\nIn practice, MoPs can simultaneously mitigate prompt training \"interference\" in\nmulti-task, multi-source scenarios (e.g., task and data heterogeneity across\nsources), as well as possible implications from model approximations. As a\nhighlight, MoPs manage to decrease final perplexity from $\\sim20\\%$ up to\n$\\sim70\\%$, as compared to baselines, in the federated scenario, and from $\\sim\n3\\%$ up to $\\sim30\\%$ in the centralized scenario.\n","authors":["Chen Dun","Mirian Del Carmen Hipolito Garcia","Guoqing Zheng","Ahmed Hassan Awadallah","Anastasios Kyrillidis","Robert Sim"],"pdf_url":"https://arxiv.org/pdf/2310.02842v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02804v1","updated":"2023-10-04T13:29:47Z","published":"2023-10-04T13:29:47Z","title":"DOMINO: A Dual-System for Multi-step Visual Language Reasoning","summary":" Visual language reasoning requires a system to extract text or numbers from\ninformation-dense images like charts or plots and perform logical or arithmetic\nreasoning to arrive at an answer. To tackle this task, existing work relies on\neither (1) an end-to-end vision-language model trained on a large amount of\ndata, or (2) a two-stage pipeline where a captioning model converts the image\ninto text that is further read by another large language model to deduce the\nanswer. However, the former approach forces the model to answer a complex\nquestion with one single step, and the latter approach is prone to inaccurate\nor distracting information in the converted text that can confuse the language\nmodel. In this work, we propose a dual-system for multi-step multimodal\nreasoning, which consists of a \"System-1\" step for visual information\nextraction and a \"System-2\" step for deliberate reasoning. Given an input,\nSystem-2 breaks down the question into atomic sub-steps, each guiding System-1\nto extract the information required for reasoning from the image. 
Experiments\non chart and plot datasets show that our method with a pre-trained System-2\nmodule performs competitively compared to prior work on in- and\nout-of-distribution data. By fine-tuning the System-2 module (LLaMA-2 70B) on\nonly a small amount of data on multi-step reasoning, the accuracy of our method\nis further improved and surpasses the best fully-supervised end-to-end approach\nby 5.7% and a pipeline approach with FlanPaLM (540B) by 7.5% on a challenging\ndataset with human-authored questions.\n","authors":["Peifang Wang","Olga Golovneva","Armen Aghajanyan","Xiang Ren","Muhao Chen","Asli Celikyilmaz","Maryam Fazel-Zarandi"],"pdf_url":"https://arxiv.org/pdf/2310.02804v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00648v2","updated":"2023-10-04T13:21:44Z","published":"2023-10-01T12:07:44Z","title":"Fewer is More: Trojan Attacks on Parameter-Efficient Fine-Tuning","summary":" Parameter-efficient fine-tuning (PEFT) enables efficient adaptation of\npre-trained language models (PLMs) to specific tasks. By tuning only a minimal\nset of (extra) parameters, PEFT achieves performance comparable to full\nfine-tuning. However, despite its prevalent use, the security implications of\nPEFT remain largely unexplored. In this paper, we conduct a pilot study\nrevealing that PEFT exhibits unique vulnerability to trojan attacks.\nSpecifically, we present PETA, a novel attack that accounts for downstream\nadaptation through bilevel optimization: the upper-level objective embeds the\nbackdoor into a PLM while the lower-level objective simulates PEFT to retain\nthe PLM's task-specific performance. With extensive evaluation across a variety\nof downstream tasks and trigger designs, we demonstrate PETA's effectiveness in\nterms of both attack success rate and unaffected clean accuracy, even after the\nvictim user performs PEFT over the backdoored PLM using untainted data.\nMoreover, we empirically provide possible explanations for PETA's efficacy: the\nbilevel optimization inherently 'orthogonalizes' the backdoor and PEFT modules,\nthereby retaining the backdoor throughout PEFT. Based on this insight, we\nexplore a simple defense that omits PEFT in selected layers of the backdoored\nPLM and unfreezes a subset of these layers' parameters, which is shown to\neffectively neutralize PETA.\n","authors":["Lauren Hong","Ting Wang"],"pdf_url":"https://arxiv.org/pdf/2310.00648v2.pdf","comment":"16 pages, 5 figures"},{"id":"http://arxiv.org/abs/2309.08600v3","updated":"2023-10-04T13:17:38Z","published":"2023-09-15T17:56:55Z","title":"Sparse Autoencoders Find Highly Interpretable Features in Language\n Models","summary":" One of the roadblocks to a better understanding of neural networks' internals\nis \\textit{polysemanticity}, where neurons appear to activate in multiple,\nsemantically distinct contexts. Polysemanticity prevents us from identifying\nconcise, human-understandable explanations for what neural networks are doing\ninternally. One hypothesised cause of polysemanticity is\n\\textit{superposition}, where neural networks represent more features than they\nhave neurons by assigning features to an overcomplete set of directions in\nactivation space, rather than to individual neurons. Here, we attempt to\nidentify those directions, using sparse autoencoders to reconstruct the\ninternal activations of a language model. 
These autoencoders learn sets of\nsparsely activating features that are more interpretable and monosemantic than\ndirections identified by alternative approaches, where interpretability is\nmeasured by automated methods. Moreover, we show that with our learned set of\nfeatures, we can pinpoint the features that are causally responsible for\ncounterfactual behaviour on the indirect object identification task\n\\citep{wang2022interpretability} to a finer degree than previous\ndecompositions. This work indicates that it is possible to resolve\nsuperposition in language models using a scalable, unsupervised method. Our\nmethod may serve as a foundation for future mechanistic interpretability work,\nwhich we hope will enable greater model transparency and steerability.\n","authors":["Hoagy Cunningham","Aidan Ewart","Logan Riggs","Robert Huben","Lee Sharkey"],"pdf_url":"https://arxiv.org/pdf/2309.08600v3.pdf","comment":"20 pages, 18 figures, 2 tables"},{"id":"http://arxiv.org/abs/2310.02790v1","updated":"2023-10-04T13:09:39Z","published":"2023-10-04T13:09:39Z","title":"Low Resource Summarization using Pre-trained Language Models","summary":" With the advent of Deep Learning based Artificial Neural Networks models,\nNatural Language Processing (NLP) has witnessed significant improvements in\ntextual data processing in terms of its efficiency and accuracy. However, the\nresearch is mostly restricted to high-resource languages such as English and\nlow-resource languages still suffer from a lack of available resources in terms\nof training datasets as well as models with even baseline evaluation results.\nConsidering the limited availability of resources for low-resource languages,\nwe propose a methodology for adapting self-attentive transformer-based\narchitecture models (mBERT, mT5) for low-resource summarization, supplemented\nby the construction of a new baseline dataset (76.5k article, summary pairs) in\na low-resource language Urdu. Choosing news (a publicly available source) as\nthe application domain has the potential to make the proposed methodology\nuseful for reproducing in other languages with limited resources. Our adapted\nsummarization model \\textit{urT5} with up to 44.78\\% reduction in size as\ncompared to \\textit{mT5} can capture contextual information of low resource\nlanguage effectively with evaluation score (up to 46.35 ROUGE-1, 77 BERTScore)\nat par with state-of-the-art models in high resource language English\n\\textit{(PEGASUS: 47.21, BART: 45.14 on XSUM Dataset)}. The proposed method\nprovided a baseline approach towards extractive as well as abstractive\nsummarization with competitive evaluation results in a limited resource setup.\n","authors":["Mubashir Munaf","Hammad Afzal","Naima Iltaf","Khawir Mahmood"],"pdf_url":"https://arxiv.org/pdf/2310.02790v1.pdf","comment":"17 pages, 7 figures, 3 tables"},{"id":"http://arxiv.org/abs/2310.02778v1","updated":"2023-10-04T12:50:26Z","published":"2023-10-04T12:50:26Z","title":"A UMLS-Augmented Framework for Improving Factuality in Large Language\n Models within Healthcare","summary":" Large language models (LLMs) have demonstrated powerful text generation\ncapabilities, bringing unprecedented innovation to the healthcare field. While\nLLMs hold immense promise for applications in healthcare, applying them to real\nclinical scenarios presents significant challenges, as these models may\ngenerate content that deviates from established medical facts and even exhibit\npotential biases. 
In our research, we develop an augmented LLM framework based\non the Unified Medical Language System (UMLS), aiming to better serve the\nhealthcare community. We employ LLaMa2-13b-chat and ChatGPT-3.5 as our\nbenchmark models, and conduct automatic evaluations using the ROUGE Score and\nBERTScore on 104 questions from the LiveQA test set. Additionally, we establish\ncriteria for physician-evaluation based on four dimensions: Factuality,\nCompleteness, Readability and Relevancy. ChatGPT-3.5 is used for physician\nevaluation with 20 questions on the LiveQA test set. Multiple resident\nphysicians conducted blind reviews to evaluate the generated content, and the\nresults indicate that this framework effectively enhances the factuality,\ncompleteness, and relevance of generated content. Our research demonstrates the\neffectiveness of using UMLS-augmented LLMs and highlights the potential\napplication value of LLMs in in medical question-answering.\n","authors":["Rui Yang","Edison Marrese-Taylor","Yuhe Ke","Lechao Cheng","Qingyu Chen","Irene Li"],"pdf_url":"https://arxiv.org/pdf/2310.02778v1.pdf","comment":"12 pages, 3 figures"},{"id":"http://arxiv.org/abs/2310.02777v1","updated":"2023-10-04T12:48:33Z","published":"2023-10-04T12:48:33Z","title":"The Role of Linguistic Priors in Measuring Compositional Generalization\n of Vision-Language Models","summary":" Compositionality is a common property in many modalities including natural\nlanguages and images, but the compositional generalization of multi-modal\nmodels is not well-understood. In this paper, we identify two sources of\nvisual-linguistic compositionality: linguistic priors and the interplay between\nimages and texts. We show that current attempts to improve compositional\ngeneralization rely on linguistic priors rather than on information in the\nimage. We also propose a new metric for compositionality without such\nlinguistic priors.\n","authors":["Chenwei Wu","Li Erran Li","Stefano Ermon","Patrick Haffner","Rong Ge","Zaiwei Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.02777v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.08099v3","updated":"2023-10-04T12:15:56Z","published":"2023-05-14T08:26:24Z","title":"Self-supervised Neural Factor Analysis for Disentangling Utterance-level\n Speech Representations","summary":" Self-supervised learning (SSL) speech models such as wav2vec and HuBERT have\ndemonstrated state-of-the-art performance on automatic speech recognition (ASR)\nand proved to be extremely useful in low label-resource settings. However, the\nsuccess of SSL models has yet to transfer to utterance-level tasks such as\nspeaker, emotion, and language recognition, which still require supervised\nfine-tuning of the SSL models to obtain good performance. We argue that the\nproblem is caused by the lack of disentangled representations and an\nutterance-level learning objective for these tasks. Inspired by how HuBERT uses\nclustering to discover hidden acoustic units, we formulate a factor analysis\n(FA) model that uses the discovered hidden acoustic units to align the SSL\nfeatures. The underlying utterance-level representations are disentangled from\nthe content of speech using probabilistic inference on the aligned features.\nFurthermore, the variational lower bound derived from the FA model provides an\nutterance-level objective, allowing error gradients to be backpropagated to the\nTransformer layers to learn highly discriminative acoustic units. 
When used in\nconjunction with HuBERT's masked prediction training, our models outperform the\ncurrent best model, WavLM, on all utterance-level non-semantic tasks on the\nSUPERB benchmark with only 20% of labeled data.\n","authors":["Weiwei Lin","Chenhang He","Man-Wai Mak","Youzhi Tu"],"pdf_url":"https://arxiv.org/pdf/2305.08099v3.pdf","comment":"accepted by ICML 2023"},{"id":"http://arxiv.org/abs/2310.02759v1","updated":"2023-10-04T12:14:43Z","published":"2023-10-04T12:14:43Z","title":"Comparative Study and Framework for Automated Summariser Evaluation:\n LangChain and Hybrid Algorithms","summary":" Automated Essay Score (AES) is proven to be one of the cutting-edge\ntechnologies. Scoring techniques are used for various purposes. Reliable scores\nare calculated based on influential variables. Such variables can be computed\nby different methods based on the domain. The research is concentrated on the\nuser's understanding of a given topic. The analysis is based on a scoring index\nby using Large Language Models. The user can then compare and contrast the\nunderstanding of a topic that they recently learned. The results are then\ncontributed towards learning analytics and progression is made for enhancing\nthe learning ability. In this research, the focus is on summarizing a PDF\ndocument and gauging a user's understanding of its content. The process\ninvolves utilizing a Langchain tool to summarize the PDF and extract the\nessential information. By employing this technique, the research aims to\ndetermine how well the user comprehends the summarized content.\n","authors":["Bagiya Lakshmi S","Sanjjushri Varshini R","Rohith Mahadevan","Raja CSP Raman"],"pdf_url":"https://arxiv.org/pdf/2310.02759v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02754v1","updated":"2023-10-04T11:49:37Z","published":"2023-10-04T11:49:37Z","title":"LC-Score: Reference-less estimation of Text Comprehension Difficulty","summary":" Being able to read and understand written text is critical in a digital era.\nHowever, studies shows that a large fraction of the population experiences\ncomprehension issues. In this context, further initiatives in accessibility are\nrequired to improve the audience text comprehension. However, writers are\nhardly assisted nor encouraged to produce easy-to-understand content. Moreover,\nAutomatic Text Simplification (ATS) model development suffers from the lack of\nmetric to accurately estimate comprehension difficulty We present\n\\textsc{LC-Score}, a simple approach for training text comprehension metric for\nany French text without reference \\ie predicting how easy to understand a given\ntext is on a $[0, 100]$ scale. Our objective with this scale is to\nquantitatively capture the extend to which a text suits to the \\textit{Langage\nClair} (LC, \\textit{Clear Language}) guidelines, a French initiative closely\nrelated to English Plain Language. We explore two approaches: (i) using\nlinguistically motivated indicators used to train statistical models, and (ii)\nneural learning directly from text leveraging pre-trained language models. We\nintroduce a simple proxy task for comprehension difficulty training as a\nclassification task. 
To evaluate our models, we run two distinct human\nannotation experiments, and find that both approaches (indicator based and\nneural) outperforms commonly used readability and comprehension metrics such as\nFKGL and SAMSA.\n","authors":["Paul Tardy","Charlotte Roze","Paul Poupet"],"pdf_url":"https://arxiv.org/pdf/2310.02754v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11088v3","updated":"2023-10-04T10:04:25Z","published":"2023-07-20T17:59:41Z","title":"L-Eval: Instituting Standardized Evaluation for Long Context Language\n Models","summary":" Recently, there has been growing interest in extending the context length of\nlarge language models (LLMs), aiming to effectively process long inputs of one\nturn or conversations with more extensive histories. While proprietary models\nsuch as GPT-4 and Claude can largely preserve the reasoning ability in an\nextended context, open-source models are still progressing through the early\nstages of development. To bridge this gap, we propose L-Eval to institute a\nmore standardized evaluation for long context language models (LCLMs)\naddressing two key aspects: dataset construction and evaluation metrics. On the\none hand, we build a new evaluation suite containing 20 sub-tasks, 508 long\ndocuments, and over 2,000 human-labeled query-response pairs encompassing\ndiverse question styles, domains, and input length (3k$\\sim$200k tokens). On\nthe other hand, we investigate the effectiveness in evalution metrics for\nLCLMs. Results show that popular n-gram matching metrics generally can not\ncorrelate well with human judgment, and thus we strongly advocate for\nlength-instruction-enhanced (LIE) evaluation and employing LLM judges. We\nconducted a comprehensive study of 4 popular commercial LLMs and 12 open-source\ncounterparts using the L-Eval benchmark. Our empirical findings offer useful\ninsights into the study of LCLMs and lay the groundwork for the development of\nmore principled evaluation of these models.\n","authors":["Chenxin An","Shansan Gong","Ming Zhong","Xingjian Zhao","Mukai Li","Jun Zhang","Lingpeng Kong","Xipeng Qiu"],"pdf_url":"https://arxiv.org/pdf/2307.11088v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.01590v4","updated":"2023-10-04T08:47:30Z","published":"2023-03-02T21:27:54Z","title":"Technical report: Graph Neural Networks go Grammatical","summary":" This paper introduces a framework for formally establishing a connection\nbetween a portion of an algebraic language and a Graph Neural Network (GNN).\nThe framework leverages Context-Free Grammars (CFG) to organize algebraic\noperations into generative rules that can be translated into a GNN layer model.\nAs CFGs derived directly from a language tend to contain redundancies in their\nrules and variables, we present a grammar reduction scheme. By applying this\nstrategy, we define a CFG that conforms to the third-order Weisfeiler-Lehman\n(3-WL) test using MATLANG. From this 3-WL CFG, we derive a GNN model, named\nG$^2$N$^2$, which is provably 3-WL compliant. Through various experiments, we\ndemonstrate the superior efficiency of G$^2$N$^2$ compared to other 3-WL GNNs\nacross numerous downstream tasks. 
Specifically, one experiment highlights the\nbenefits of grammar reduction within our framework.\n","authors":["Jason Piquenot","Aldo Moscatelli","Maxime Bérar","Pierre Héroux","Romain raveaux","Jean-Yves Ramel","Sébastien Adam"],"pdf_url":"https://arxiv.org/pdf/2303.01590v4.pdf","comment":"24 pages, 11 figures"},{"id":"http://arxiv.org/abs/2310.02655v1","updated":"2023-10-04T08:25:37Z","published":"2023-10-04T08:25:37Z","title":"AGIR: Automating Cyber Threat Intelligence Reporting with Natural\n Language Generation","summary":" Cyber Threat Intelligence (CTI) reporting is pivotal in contemporary risk\nmanagement strategies. As the volume of CTI reports continues to surge, the\ndemand for automated tools to streamline report generation becomes increasingly\napparent. While Natural Language Processing techniques have shown potential in\nhandling text data, they often struggle to address the complexity of diverse\ndata sources and their intricate interrelationships. Moreover, established\nparadigms like STIX have emerged as de facto standards within the CTI\ncommunity, emphasizing the formal categorization of entities and relations to\nfacilitate consistent data sharing. In this paper, we introduce AGIR (Automatic\nGeneration of Intelligence Reports), a transformative Natural Language\nGeneration tool specifically designed to address the pressing challenges in the\nrealm of CTI reporting. AGIR's primary objective is to empower security\nanalysts by automating the labor-intensive task of generating comprehensive\nintelligence reports from formal representations of entity graphs. AGIR\nutilizes a two-stage pipeline by combining the advantages of template-based\napproaches and the capabilities of Large Language Models such as ChatGPT. We\nevaluate AGIR's report generation capabilities both quantitatively and\nqualitatively. The generated reports accurately convey information expressed\nthrough formal language, achieving a high recall value (0.99) without\nintroducing hallucination. Furthermore, we compare the fluency and utility of\nthe reports with state-of-the-art approaches, showing how AGIR achieves higher\nscores in terms of Syntactic Log-Odds Ratio (SLOR) and through questionnaires.\nBy using our tool, we estimate that the report writing time is reduced by more\nthan 40%, therefore streamlining the CTI production of any organization and\ncontributing to the automation of several CTI tasks.\n","authors":["Filippo Perrina","Francesco Marchiori","Mauro Conti","Nino Vincenzo Verde"],"pdf_url":"https://arxiv.org/pdf/2310.02655v1.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2309.17452v2","updated":"2023-10-04T08:13:03Z","published":"2023-09-29T17:59:38Z","title":"ToRA: A Tool-Integrated Reasoning Agent for Mathematical Problem Solving","summary":" Large language models have made significant progress in various language\ntasks, yet they still struggle with complex mathematics. In this paper, we\npropose ToRA a series of Tool-integrated Reasoning Agents designed to solve\nchallenging mathematical problems by seamlessly integrating natural language\nreasoning with the utilization of external tools (e.g., computation libraries\nand symbolic solvers), thereby amalgamating the analytical prowess of language\nand the computational efficiency of tools. To train ToRA, we curate interactive\ntool-use trajectories on mathematical datasets, apply imitation learning on the\nannotations, and propose output space shaping to further refine models'\nreasoning behavior. 
As a result, ToRA models significantly outperform\nopen-source models on 10 mathematical reasoning datasets across all scales with\n13%-19% absolute improvements on average. Notably, ToRA-7B reaches 44.6% on the\ncompetition-level dataset MATH, surpassing the best open-source model\nWizardMath-70B by 22% absolute. ToRA-Code-34B is also the first open-source\nmodel that achieves an accuracy exceeding 50% on MATH, which significantly\noutperforms GPT-4's CoT result, and is competitive with GPT-4 solving problems\nwith programs. Additionally, we conduct a comprehensive analysis of the\nbenefits and remaining challenges of tool interaction for mathematical\nreasoning, providing valuable insights for future research.\n","authors":["Zhibin Gou","Zhihong Shao","Yeyun Gong","Yelong Shen","Yujiu Yang","Minlie Huang","Nan Duan","Weizhu Chen"],"pdf_url":"https://arxiv.org/pdf/2309.17452v2.pdf","comment":"First two authors equal contribution"},{"id":"http://arxiv.org/abs/2310.01839v2","updated":"2023-10-04T06:51:24Z","published":"2023-10-03T07:05:37Z","title":"Preserving Phonemic Distinctions for Ordinal Regression: A Novel Loss\n Function for Automatic Pronunciation Assessment","summary":" Automatic pronunciation assessment (APA) manages to quantify the\npronunciation proficiency of a second language (L2) learner in a language.\nPrevailing approaches to APA normally leverage neural models trained with a\nregression loss function, such as the mean-squared error (MSE) loss, for\nproficiency level prediction. Despite most regression models can effectively\ncapture the ordinality of proficiency levels in the feature space, they are\nconfronted with a primary obstacle that different phoneme categories with the\nsame proficiency level are inevitably forced to be close to each other,\nretaining less phoneme-discriminative information. On account of this, we\ndevise a phonemic contrast ordinal (PCO) loss for training regression-based APA\nmodels, which aims to preserve better phonemic distinctions between phoneme\ncategories meanwhile considering ordinal relationships of the regression target\noutput. Specifically, we introduce a phoneme-distinct regularizer into the MSE\nloss, which encourages feature representations of different phoneme categories\nto be far apart while simultaneously pulling closer the representations\nbelonging to the same phoneme category by means of weighted distances. An\nextensive set of experiments carried out on the speechocean762 benchmark\ndataset suggest the feasibility and effectiveness of our model in relation to\nsome existing state-of-the-art models.\n","authors":["Bi-Cheng Yan","Hsin-Wei Wang","Yi-Cheng Wang","Jiun-Ting Li","Chi-Han Lin","Berlin Chen"],"pdf_url":"https://arxiv.org/pdf/2310.01839v2.pdf","comment":"Accepted by ASRU 2023"},{"id":"http://arxiv.org/abs/2306.01102v6","updated":"2023-10-04T06:51:09Z","published":"2023-06-01T19:33:21Z","title":"LLMatic: Neural Architecture Search via Large Language Models and\n Quality Diversity Optimization","summary":" Large Language Models (LLMs) have emerged as powerful tools capable of\naccomplishing a broad spectrum of tasks. Their abilities span numerous areas,\nand one area where they have made a significant impact is in the domain of code\ngeneration. In this context, we view LLMs as mutation and crossover tools.\nMeanwhile, Quality-Diversity (QD) algorithms are known to discover diverse and\nrobust solutions. 
By merging the code-generating abilities of LLMs with the\ndiversity and robustness of QD solutions, we introduce LLMatic, a Neural\nArchitecture Search (NAS) algorithm. While LLMs struggle to conduct NAS\ndirectly through prompts, LLMatic uses a procedural approach, leveraging QD for\nprompts and network architecture to create diverse and highly performant\nnetworks. We test LLMatic on the CIFAR-10 image classification benchmark,\ndemonstrating that it can produce competitive networks with just $2,000$\nsearches, even without prior knowledge of the benchmark domain or exposure to\nany previous top-performing models for the benchmark.\n","authors":["Muhammad U. Nasir","Sam Earle","Julian Togelius","Steven James","Christopher Cleghorn"],"pdf_url":"https://arxiv.org/pdf/2306.01102v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02594v1","updated":"2023-10-04T05:45:23Z","published":"2023-10-04T05:45:23Z","title":"I$^2$KD-SLU: An Intra-Inter Knowledge Distillation Framework for\n Zero-Shot Cross-Lingual Spoken Language Understanding","summary":" Spoken language understanding (SLU) typically includes two subtasks: intent\ndetection and slot filling. Currently, it has achieved great success in\nhigh-resource languages, but it still remains challenging in low-resource\nlanguages due to the scarcity of labeled training data. Hence, there is a\ngrowing interest in zero-shot cross-lingual SLU. Despite of the success of\nexisting zero-shot cross-lingual SLU models, most of them neglect to achieve\nthe mutual guidance between intent and slots. To address this issue, we propose\nan Intra-Inter Knowledge Distillation framework for zero-shot cross-lingual\nSpoken Language Understanding (I$^2$KD-SLU) to model the mutual guidance.\nSpecifically, we not only apply intra-knowledge distillation between intent\npredictions or slot predictions of the same utterance in different languages,\nbut also apply inter-knowledge distillation between intent predictions and slot\npredictions of the same utterance. Our experimental results demonstrate that\nour proposed framework significantly improves the performance compared with the\nstrong baselines and achieves the new state-of-the-art performance on the\nMultiATIS++ dataset, obtaining a significant improvement over the previous best\nmodel in overall accuracy.\n","authors":["Tianjun Mao","Chenghong Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.02594v1.pdf","comment":"12 pages,2 figures"},{"id":"http://arxiv.org/abs/2310.01468v2","updated":"2023-10-04T05:40:10Z","published":"2023-10-02T16:55:37Z","title":"The Entity-Deduction Arena: A playground for probing the conversational\n reasoning and planning capabilities of LLMs","summary":" Large language models (LLMs) are effective at answering questions that are\nclearly asked. However, when faced with ambiguous queries they can act\nunpredictably and produce incorrect outputs. This underscores the need for the\ndevelopment of intelligent agents capable of asking clarification questions to\nresolve ambiguities effectively. This capability requires complex\nunderstanding, state tracking, reasoning and planning over multiple\nconversational turns. However, directly measuring this can be challenging. In\nthis paper, we offer a surrogate problem which assesses an LLMs's capability to\ndeduce an entity unknown to itself, but revealed to a judge, by asking the\njudge a series of queries. 
This entity-deducing game can serve as an evaluation\nframework to probe the conversational reasoning and planning capabilities of\nlanguage models. We systematically evaluate various LLMs and discover\nsignificant differences in their performance on this task. We find that strong\nLLMs like GPT-4 outperform human players by a large margin. We further employ\nBehavior Cloning (BC) to examine whether a weaker model is capable of imitating\na stronger model and generalizing to data or domains, using only the\ndemonstrations from a stronger model. We finally propose to use Reinforcement\nLearning to enhance reasoning and planning capacity of Vicuna models through\nepisodes of game playing, which lead to significant performance improvement. We\nhope that this problem offers insights into how autonomous agents could be\ntrained to behave more intelligently in ambiguous circumstances.\n","authors":["Yizhe Zhang","Jiarui Lu","Navdeep Jaitly"],"pdf_url":"https://arxiv.org/pdf/2310.01468v2.pdf","comment":"22 pages"},{"id":"http://arxiv.org/abs/2310.02238v2","updated":"2023-10-04T05:20:19Z","published":"2023-10-03T17:48:14Z","title":"Who's Harry Potter? Approximate Unlearning in LLMs","summary":" Large language models (LLMs) are trained on massive internet corpora that\noften contain copyrighted content. This poses legal and ethical challenges for\nthe developers and users of these models, as well as the original authors and\npublishers. In this paper, we propose a novel technique for unlearning a subset\nof the training data from a LLM, without having to retrain it from scratch.\n We evaluate our technique on the task of unlearning the Harry Potter books\nfrom the Llama2-7b model (a generative language model recently open-sourced by\nMeta). While the model took over 184K GPU-hours to pretrain, we show that in\nabout 1 GPU hour of finetuning, we effectively erase the model's ability to\ngenerate or recall Harry Potter-related content, while its performance on\ncommon benchmarks (such as Winogrande, Hellaswag, arc, boolq and piqa) remains\nalmost unaffected. We make our fine-tuned model publicly available on\nHuggingFace for community evaluation. To the best of our knowledge, this is the\nfirst paper to present an effective technique for unlearning in generative\nlanguage models.\n Our technique consists of three main components: First, we use a reinforced\nmodel that is further trained on the target data to identify the tokens that\nare most related to the unlearning target, by comparing its logits with those\nof a baseline model. Second, we replace idiosyncratic expressions in the target\ndata with generic counterparts, and leverage the model's own predictions to\ngenerate alternative labels for every token. These labels aim to approximate\nthe next-token predictions of a model that has not been trained on the target\ndata. 
Third, we finetune the model on these alternative labels, which\neffectively erases the original text from the model's memory whenever it is\nprompted with its context.\n","authors":["Ronen Eldan","Mark Russinovich"],"pdf_url":"https://arxiv.org/pdf/2310.02238v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10928v2","updated":"2023-10-04T04:11:16Z","published":"2023-07-20T14:56:35Z","title":"FLASK: Fine-grained Language Model Evaluation based on Alignment Skill\n Sets","summary":" Evaluation of Large Language Models (LLMs) is challenging because\ninstruction-following necessitates alignment with human values and the required\nset of skills varies depending on the instruction. However, previous studies\nhave mainly focused on coarse-grained evaluation (i.e. overall preference-based\nevaluation), which limits interpretability since it does not consider the\nnature of user instructions that require instance-wise skill composition. In\nthis paper, we introduce FLASK (Fine-grained Language Model Evaluation based on\nAlignment Skill Sets), a fine-grained evaluation protocol for both human-based\nand model-based evaluation which decomposes coarse-level scoring to a skill\nset-level scoring for each instruction. We experimentally observe that the\nfine-graininess of evaluation is crucial for attaining a holistic view of model\nperformance and increasing the reliability of the evaluation. Using FLASK, we\ncompare multiple open-source and proprietary LLMs and observe a high\ncorrelation between model-based and human-based evaluations. We publicly\nrelease the evaluation data and code implementation at\nhttps://github.com/kaistAI/FLASK.\n","authors":["Seonghyeon Ye","Doyoung Kim","Sungdong Kim","Hyeonbin Hwang","Seungone Kim","Yongrae Jo","James Thorne","Juho Kim","Minjoon Seo"],"pdf_url":"https://arxiv.org/pdf/2307.10928v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02567v1","updated":"2023-10-04T03:59:57Z","published":"2023-10-04T03:59:57Z","title":"Improving Automatic VQA Evaluation Using Large Language Models","summary":" 8 years after the visual question answering (VQA) task was proposed, accuracy\nremains the primary metric for automatic evaluation. VQA Accuracy has been\neffective so far in the IID evaluation setting. However, our community is\nundergoing a shift towards open-ended generative models and OOD evaluation. In\nthis new paradigm, the existing VQA Accuracy metric is overly stringent and\nunderestimates the performance of VQA systems. Thus, there is a need to develop\nmore robust automatic VQA metrics that serve as a proxy for human judgment. In\nthis work, we propose to leverage the in-context learning capabilities of\ninstruction-tuned large language models (LLMs) to build a better VQA metric. We\nformulate VQA evaluation as an answer-rating task where the LLM is instructed\nto score the accuracy of a candidate answer given a set of reference answers.\nWe demonstrate the proposed metric better correlates with human judgment\ncompared to existing metrics across several VQA models and benchmarks. 
We hope\nwide adoption of our metric will contribute to better estimating the research\nprogress on the VQA task.\n","authors":["Oscar Mañas","Benno Krojer","Aishwarya Agrawal"],"pdf_url":"https://arxiv.org/pdf/2310.02567v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02031v2","updated":"2023-10-04T03:35:33Z","published":"2023-10-03T13:17:35Z","title":"OceanGPT: A Large Language Model for Ocean Science Tasks","summary":" Ocean science, which delves into the oceans that are reservoirs of life and\nbiodiversity, is of great significance given that oceans cover over 70% of our\nplanet's surface. Recently, advances in Large Language Models (LLMs) have\ntransformed the paradigm in science. Despite the success in other domains,\ncurrent LLMs often fall short in catering to the needs of domain experts like\noceanographers, and the potential of LLMs for ocean science is under-explored.\nThe intrinsic reason may be the immense and intricate nature of ocean data as\nwell as the necessity for higher granularity and richness in knowledge. To\nalleviate these issues, we introduce OceanGPT, the first-ever LLM in the ocean\ndomain, which is expert in various ocean science tasks. We propose DoInstruct,\na novel framework to automatically obtain a large volume of ocean domain\ninstruction data, which generates instructions based on multi-agent\ncollaboration. Additionally, we construct the first oceanography benchmark,\nOceanBench, to evaluate the capabilities of LLMs in the ocean domain. Though\ncomprehensive experiments, OceanGPT not only shows a higher level of knowledge\nexpertise for oceans science tasks but also gains preliminary embodied\nintelligence capabilities in ocean technology. Codes, data and checkpoints will\nsoon be available at https://github.com/zjunlp/KnowLM.\n","authors":["Zhen Bi","Ningyu Zhang","Yida Xue","Yixin Ou","Daxiong Ji","Guozhou Zheng","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2310.02031v2.pdf","comment":"Work in progress. Project Website:\n https://zjunlp.github.io/project/OceanGPT/"},{"id":"http://arxiv.org/abs/2310.02556v1","updated":"2023-10-04T03:30:24Z","published":"2023-10-04T03:30:24Z","title":"NOLA: Networks as Linear Combination of Low Rank Random Basis","summary":" Large Language Models (LLMs) have recently gained popularity due to their\nimpressive few-shot performance across various downstream tasks. However,\nfine-tuning all parameters and storing a unique model for each downstream task\nor domain becomes impractical because of the massive size of checkpoints (e.g.,\n350GB in GPT-3). Current literature, such as LoRA, showcases the potential of\nlow-rank modifications to the original weights of an LLM, enabling efficient\nadaptation and storage for task-specific models. These methods can reduce the\nnumber of parameters needed to fine-tune an LLM by several orders of magnitude.\nYet, these methods face two primary limitations: 1) the parameter reduction is\nlower-bounded by the rank one decomposition, and 2) the extent of reduction is\nheavily influenced by both the model architecture and the chosen rank. For\ninstance, in larger models, even a rank one decomposition might exceed the\nnumber of parameters truly needed for adaptation. In this paper, we introduce\nNOLA, which overcomes the rank one lower bound present in LoRA. It achieves\nthis by re-parameterizing the low-rank matrices in LoRA using linear\ncombinations of randomly generated matrices (basis) and optimizing the linear\nmixture coefficients only. 
This approach allows us to decouple the number of\ntrainable parameters from both the choice of rank and the network architecture.\nWe present adaptation results using GPT-2 and ViT in natural language and\ncomputer vision tasks. NOLA performs as well as, or better than, models with\nequivalent parameter counts. Furthermore, we demonstrate that we can halve the\nparameters in larger models compared to LoRA with rank one, without sacrificing\nperformance.\n","authors":["Soroush Abbasi Koohpayegani","KL Navaneet","Parsa Nooralinejad","Soheil Kolouri","Hamed Pirsiavash"],"pdf_url":"https://arxiv.org/pdf/2310.02556v1.pdf","comment":"Our code is available here: https://github.com/UCDvision/NOLA"},{"id":"http://arxiv.org/abs/2310.01886v2","updated":"2023-10-04T02:30:27Z","published":"2023-10-03T08:39:33Z","title":"Effective and Parameter-Efficient Reusing Fine-Tuned Models","summary":" Many pre-trained large-scale models provided online have become highly\neffective in transferring to downstream tasks. At the same time, various\ntask-specific models fine-tuned on these pre-trained models are available\nonline for public use. In practice, as collecting task-specific data is\nlabor-intensive and fine-tuning the large pre-trained models is computationally\nexpensive, one can reuse task-specific finetuned models to deal with downstream\ntasks. However, using a model per task causes a heavy burden on storage and\nserving. Recently, many training-free and parameter-efficient methods have been\nproposed for reusing multiple fine-tuned task-specific models into a single\nmulti-task model. However, these methods exhibit a large accuracy gap compared\nwith using a fine-tuned model per task. In this paper, we propose\nParameter-Efficient methods for ReUsing (PERU) fine-tuned models. For reusing\nFully Fine-Tuned (FFT) models, we propose PERU-FFT by injecting a sparse task\nvector into a merged model by magnitude pruning. For reusing LoRA fine-tuned\nmodels, we propose PERU-LoRA, which uses a lower-rank matrix to approximate the LoRA\nmatrix by singular value decomposition. Both PERU-FFT and PERU-LoRA are\ntraining-free. Extensive experiments conducted on computer vision and natural\nlanguage processing tasks demonstrate the effectiveness and parameter-efficiency\nof the proposed methods. The proposed PERU-FFT and PERU-LoRA outperform\nexisting model-reusing methods by a large margin and achieve comparable\nperformance to using a fine-tuned model per task.\n","authors":["Weisen Jiang","Baijiong Lin","Han Shi","Yu Zhang","Zhenguo Li","James T. Kwok"],"pdf_url":"https://arxiv.org/pdf/2310.01886v2.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2305.10276v6","updated":"2023-10-04T02:03:45Z","published":"2023-05-17T15:07:50Z","title":"Chain-of-Symbol Prompting Elicits Planning in Large Langauge Models","summary":" In this paper, we take the initiative to investigate the performance of LLMs\non complex planning tasks that require LLMs to understand a virtual spatial\nenvironment simulated via natural language and act correspondingly in text. We\npropose a benchmark named Natural Language Planning and Action (Natala)\ncomposed of a set of novel tasks: Brick World, NLVR-based Manipulations, and\nNatural Language Navigation. We found that current popular LLMs such as ChatGPT\nstill lack abilities in complex planning. 
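The NOLA abstract above describes re-parameterizing LoRA's low-rank factors as linear combinations of frozen, randomly generated basis matrices, so that only the mixture coefficients are trained. A rough PyTorch sketch of that idea follows; the rank, the number of basis matrices, and the zero initialization of the coefficients are illustrative assumptions, not the paper's configuration.

    import torch
    import torch.nn as nn

    class NOLALinear(nn.Module):
        """Sketch of a NOLA-style adapter: the low-rank update A @ B is assembled
        from frozen random bases; only the coefficients alpha and beta train."""
        def __init__(self, base: nn.Linear, rank: int = 4, num_basis: int = 64):
            super().__init__()
            self.base = base  # the original layer, assumed frozen elsewhere
            out_f, in_f = base.out_features, base.in_features
            # Frozen random bases, stored as buffers so they are never updated.
            self.register_buffer("A_basis", torch.randn(num_basis, out_f, rank))
            self.register_buffer("B_basis", torch.randn(num_basis, rank, in_f))
            # The only trainable parameters: one coefficient per basis matrix.
            self.alpha = nn.Parameter(torch.zeros(num_basis))
            self.beta = nn.Parameter(torch.zeros(num_basis))

        def forward(self, x):
            A = torch.einsum("k,kor->or", self.alpha, self.A_basis)  # (out_f, rank)
            B = torch.einsum("k,kri->ri", self.beta, self.B_basis)   # (rank, in_f)
            return self.base(x) + x @ (A @ B).T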
This raises a question -- do the LLMs\nhave a good understanding of the environments described in natural language, or\nare other alternatives such as symbolic representations neater and hence\neasier for LLMs to understand? To this end, we propose a novel method called\nCoS (Chain-of-Symbol Prompting) that represents the complex environments with\ncondensed symbolic spatial representations during the chained intermediate\nthinking steps. CoS is easy to use and does not need additional training on\nLLMs. Extensive experiments indicate that CoS clearly surpasses the performance\nof the Chain-of-Thought (CoT) Prompting in all three planning tasks with even\nfewer tokens used in the inputs compared with CoT on ChatGPT and InstructGPT.\nThe performance gain is strong, by up to 60.8% accuracy (from 31.8% to 92.6%)\non Brick World for ChatGPT. CoS also reduces the number of tokens in the prompt\nnoticeably, by up to 65.8% of the tokens (from 407 to 139) for the intermediate\nsteps from demonstrations on Brick World. Code and data available at:\nhttps://github.com/hanxuhu/chain-of-symbol-planning\n","authors":["Hanxu Hu","Hongyuan Lu","Huajian Zhang","Yun-Ze Song","Wai Lam","Yue Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.10276v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02527v1","updated":"2023-10-04T01:58:34Z","published":"2023-10-04T01:58:34Z","title":"CITING: Large Language Models Create Curriculum for Instruction Tuning","summary":" The recent advancement of large language models (LLMs) has been achieved\nthrough a combination of instruction tuning and human alignment. However, building\nmanually crafted instruction datasets and performing human alignment become the\nbottleneck for scaling the development of LLMs. In this paper, we exploit the\nidea of leveraging AI models in lieu of humans as the teacher to train student\nLLMs. Our method is inspired by how human students refine their writing skills\nby following the rubrics and learning from the revisions offered by their\ntutors. Specifically, we employ a teacher LLM to create a curriculum for\ninstruction tuning of the student LLM, namely Curriculum Instruction TunING\n(CITING). It encompasses two main steps: (1) the teacher LLM crafts the rubrics\nfor evaluating the answers corresponding to various types of questions, and (2)\nthe student LLM learns to follow the rubrics and perform self-correction from\nthe revision made by the teacher. We further carry this out iteratively to embody\nthe procedure of CITING. We compare CITING to a series of state-of-the-art\nbaselines on four datasets. Our method demonstrates strong improvement in terms\nof articulateness, depth, and comprehensiveness under GPT-4 evaluation. Specifically,\nit achieves an average winning rate of 79.4% over SFT, 73.4% over RLHF, 78.1%\nover RRHF, and 76.3% over RAFT, respectively.\n","authors":["Tao Feng","Zifeng Wang","Jimeng Sun"],"pdf_url":"https://arxiv.org/pdf/2310.02527v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00589v2","updated":"2023-10-04T01:43:15Z","published":"2023-07-02T15:11:59Z","title":"MedCPT: Contrastive Pre-trained Transformers with Large-scale PubMed\n Search Logs for Zero-shot Biomedical Information Retrieval","summary":" Information retrieval (IR) is essential in biomedical knowledge acquisition\nand clinical decision support. 
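To make the Chain-of-Symbol idea summarized above more concrete, the toy sketch below condenses a verbose natural-language description of stacked blocks into a compact symbolic form before prompting. The "<" notation (top < bottom) and the helper function are invented for illustration and are not taken from the paper.

    # Toy illustration of condensing spatial relations into symbols for prompting.
    def to_symbolic(relations):
        """relations: list of (upper_block, lower_block) pairs."""
        return "\n".join(f"{upper} < {lower}" for upper, lower in relations)

    relations = [("B", "A"), ("C", "B")]
    verbose_prompt = (
        "Block B is on top of block A. Block C is on top of block B. "
        "How do you get block A?"
    )
    symbolic_prompt = (
        "Stack (top < bottom):\n" + to_symbolic(relations) +
        "\nHow do you get block A?"
    )
    print(symbolic_prompt)  # usually far shorter than the verbose description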
While recent progress has shown that language\nmodel encoders perform better semantic retrieval, training such models requires\nabundant query-article annotations that are difficult to obtain in biomedicine.\nAs a result, most biomedical IR systems only conduct lexical matching. In\nresponse, we introduce MedCPT, a first-of-its-kind Contrastively Pre-trained\nTransformer model for zero-shot semantic IR in biomedicine. To train MedCPT, we\ncollected an unprecedented scale of 255 million user click logs from PubMed.\nWith such data, we use contrastive learning to train a pair of\nclosely-integrated retriever and re-ranker. Experimental results show that\nMedCPT sets new state-of-the-art performance on six biomedical IR tasks,\noutperforming various baselines including much larger models such as\nGPT-3-sized cpt-text-XL. In addition, MedCPT also generates better biomedical\narticle and sentence representations for semantic evaluations. As such, MedCPT\ncan be readily applied to various real-world biomedical IR tasks.\n","authors":["Qiao Jin","Won Kim","Qingyu Chen","Donald C. Comeau","Lana Yeganova","W. John Wilbur","Zhiyong Lu"],"pdf_url":"https://arxiv.org/pdf/2307.00589v2.pdf","comment":"The MedCPT code and API are available at\n https://github.com/ncbi/MedCPT"},{"id":"http://arxiv.org/abs/2305.03048v2","updated":"2023-10-04T01:15:21Z","published":"2023-05-04T17:59:36Z","title":"Personalize Segment Anything Model with One Shot","summary":" Driven by large-data pre-training, Segment Anything Model (SAM) has been\ndemonstrated as a powerful and promptable framework, revolutionizing the\nsegmentation models. Despite the generality, customizing SAM for specific\nvisual concepts without man-powered prompting is under explored, e.g.,\nautomatically segmenting your pet dog in different images. In this paper, we\npropose a training-free Personalization approach for SAM, termed as PerSAM.\nGiven only a single image with a reference mask, PerSAM first localizes the\ntarget concept by a location prior, and segments it within other images or\nvideos via three techniques: target-guided attention, target-semantic\nprompting, and cascaded post-refinement. In this way, we effectively adapt SAM\nfor private use without any training. To further alleviate the mask ambiguity,\nwe present an efficient one-shot fine-tuning variant, PerSAM-F. Freezing the\nentire SAM, we introduce two learnable weights for multi-scale masks, only\ntraining 2 parameters within 10 seconds for improved performance. To\ndemonstrate our efficacy, we construct a new segmentation dataset, PerSeg, for\npersonalized evaluation, and test our methods on video object segmentation with\ncompetitive performance. Besides, our approach can also enhance DreamBooth to\npersonalize Stable Diffusion for text-to-image generation, which discards the\nbackground disturbance for better target appearance learning. 
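The MedCPT abstract above mentions contrastive training of a retriever on PubMed query-article click pairs. The sketch below shows a generic in-batch-negative (InfoNCE-style) contrastive loss of the kind commonly used for such dual encoders; it is a standard formulation over assumed embeddings, not MedCPT's exact objective or temperature.

    import torch
    import torch.nn.functional as F

    def in_batch_contrastive_loss(query_emb, article_emb, temperature=0.05):
        """query_emb, article_emb: (batch, dim) embeddings of clicked
        query-article pairs; other articles in the batch act as negatives."""
        q = F.normalize(query_emb, dim=-1)
        a = F.normalize(article_emb, dim=-1)
        logits = q @ a.T / temperature           # (batch, batch) similarities
        targets = torch.arange(q.size(0), device=q.device)
        return F.cross_entropy(logits, targets)  # diagonal entries are positives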
Code is released\nat https://github.com/ZrrSkywalker/Personalize-SAM\n","authors":["Renrui Zhang","Zhengkai Jiang","Ziyu Guo","Shilin Yan","Junting Pan","Xianzheng Ma","Hao Dong","Peng Gao","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2305.03048v2.pdf","comment":"Code is available at https://github.com/ZrrSkywalker/Personalize-SAM"},{"id":"http://arxiv.org/abs/2306.03091v2","updated":"2023-10-04T01:13:49Z","published":"2023-06-05T17:59:41Z","title":"RepoBench: Benchmarking Repository-Level Code Auto-Completion Systems","summary":" Large Language Models (LLMs) have greatly advanced code auto-completion\nsystems, with a potential for substantial productivity enhancements for\ndevelopers. However, current benchmarks mainly focus on single-file tasks,\nleaving an assessment gap for more complex, real-world, multi-file programming\nscenarios. To fill this gap, we introduce RepoBench, a new benchmark\nspecifically designed for evaluating repository-level code auto-completion\nsystems. RepoBench supports both Python and Java and consists of three\ninterconnected evaluation tasks: RepoBench-R (Retrieval), RepoBench-C (Code\nCompletion), and RepoBench-P (Pipeline). Each task respectively measures the\nsystem's ability to retrieve the most relevant code snippets from other files\nas cross-file context, predict the next line of code with cross-file and\nin-file context, and handle complex tasks that require a combination of both\nretrieval and next-line prediction. RepoBench aims to facilitate a more\ncomplete comparison of performance and encouraging continuous improvement in\nauto-completion systems. RepoBench is publicly available at\nhttps://github.com/Leolty/repobench.\n","authors":["Tianyang Liu","Canwen Xu","Julian McAuley"],"pdf_url":"https://arxiv.org/pdf/2306.03091v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.03403v4","updated":"2023-10-04T00:46:08Z","published":"2023-01-04T19:20:18Z","title":"A comprehensive review of automatic text summarization techniques:\n method, data, evaluation and coding","summary":" We provide a literature review about Automatic Text Summarization (ATS)\nsystems. We consider a citation-based approach. We start with some popular and\nwell-known papers that we have in hand about each topic we want to cover and we\nhave tracked the \"backward citations\" (papers that are cited by the set of\npapers we knew beforehand) and the \"forward citations\" (newer papers that cite\nthe set of papers we knew beforehand). In order to organize the different\nmethods, we present the diverse approaches to ATS guided by the mechanisms they\nuse to generate a summary. Besides presenting the methods, we also present an\nextensive review of the datasets available for summarization tasks and the\nmethods used to evaluate the quality of the summaries. Finally, we present an\nempirical exploration of these methods using the CNN Corpus dataset that\nprovides golden summaries for extractive and abstractive methods.\n","authors":["Daniel O. Cajueiro","Arthur G. Nery","Igor Tavares","Maísa K. De Melo","Silvia A. dos Reis","Li Weigang","Victor R. R. Celestino"],"pdf_url":"https://arxiv.org/pdf/2301.03403v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.07136v3","updated":"2023-10-04T00:15:50Z","published":"2022-06-14T19:49:44Z","title":"Automatic Clipping: Differentially Private Deep Learning Made Easier and\n Stronger","summary":" Per-example gradient clipping is a key algorithmic step that enables\npractical differential private (DP) training for deep learning models. 
The\nchoice of clipping threshold R, however, is vital for achieving high accuracy\nunder DP. We propose an easy-to-use replacement, called automatic clipping,\nthat eliminates the need to tune R for any DP optimizers, including DP-SGD,\nDP-Adam, DP-LAMB and many others. The automatic variants are as private and\ncomputationally efficient as existing DP optimizers, but require no DP-specific\nhyperparameters and thus make DP training as amenable as the standard\nnon-private training. We give a rigorous convergence analysis of automatic\nDP-SGD in the non-convex setting, showing that it can enjoy an asymptotic\nconvergence rate that matches the standard SGD, under a symmetric gradient\nnoise assumption of the per-sample gradients (commonly used in the non-DP\nliterature). We demonstrate on various language and vision tasks that automatic\nclipping outperforms or matches the state-of-the-art, and can be easily\nemployed with minimal changes to existing codebases.\n","authors":["Zhiqi Bu","Yu-Xiang Wang","Sheng Zha","George Karypis"],"pdf_url":"https://arxiv.org/pdf/2206.07136v3.pdf","comment":"accepted to NeurIPS 2023"},{"id":"http://arxiv.org/abs/2310.03211v1","updated":"2023-10-04T23:33:36Z","published":"2023-10-04T23:33:36Z","title":"On the Performance of Multimodal Language Models","summary":" Instruction-tuned large language models (LLMs) have demonstrated promising\nzero-shot generalization capabilities across various downstream tasks. Recent\nresearch has introduced multimodal capabilities to LLMs by integrating\nindependently pretrained vision encoders through model grafting. These\nmultimodal variants undergo instruction tuning, similar to LLMs, enabling\neffective zero-shot generalization for multimodal tasks. This study conducts a\ncomparative analysis of different multimodal instruction tuning approaches and\nevaluates their performance across a range of tasks, including complex\nreasoning, conversation, image captioning, multiple-choice questions (MCQs),\nand binary classification. Through rigorous benchmarking and ablation\nexperiments, we reveal key insights for guiding architectural choices when\nincorporating multimodal capabilities into LLMs. However, current approaches\nhave limitations; they do not sufficiently address the need for a diverse\nmultimodal instruction dataset, which is crucial for enhancing task\ngeneralization. Additionally, they overlook issues related to truthfulness and\nfactuality when generating responses. These findings illuminate current\nmethodological constraints in adapting language models for image comprehension\nand provide valuable guidance for researchers and practitioners seeking to\nharness multimodal versions of LLMs.\n","authors":["Utsav Garg","Erhan Bas"],"pdf_url":"https://arxiv.org/pdf/2310.03211v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03210v1","updated":"2023-10-04T23:32:33Z","published":"2023-10-04T23:32:33Z","title":"Can Language Models Employ the Socratic Method? Experiments with Code\n Debugging","summary":" When employing the Socratic method of teaching, instructors guide students\ntoward solving a problem on their own rather than providing the solution\ndirectly. While this strategy can substantially improve learning outcomes, it\nis usually time-consuming and cognitively demanding. Automated Socratic\nconversational agents can augment human instruction and provide the necessary\nscale, however their development is hampered by the lack of suitable data for\ntraining and evaluation. 
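The automatic clipping abstract above replaces the tuned per-example clipping threshold R with a normalization of each per-sample gradient. A minimal sketch contrasting the two operations on a batch of per-sample gradients follows; the stabilizer value gamma is an assumption, and integration with a full DP-SGD step (noise addition, privacy accounting) is omitted.

    import torch

    def clip_per_sample(grads, R):
        """Standard DP clipping: rescale each per-sample gradient to norm <= R."""
        norms = grads.flatten(1).norm(dim=1)                      # (batch,)
        factors = torch.clamp(R / (norms + 1e-12), max=1.0)
        return grads * factors.view(-1, *([1] * (grads.dim() - 1)))

    def automatic_clip(grads, gamma=0.01):
        """Automatic-clipping sketch: normalize each per-sample gradient so no
        threshold R has to be tuned; gamma is a small stabilizer (assumed)."""
        norms = grads.flatten(1).norm(dim=1)
        return grads / (norms + gamma).view(-1, *([1] * (grads.dim() - 1)))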
In this paper, we introduce a manually created dataset\nof multi-turn Socratic advice that is aimed at helping a novice programmer fix\nbuggy solutions to simple computational problems. The dataset is then used for\nbenchmarking the Socratic debugging abilities of a number of language models,\nranging from fine-tuning the instruction-based text-to-text transformer Flan-T5\nto zero-shot and chain of thought prompting of the much larger GPT-4. The code\nand datasets are made freely available for research at the link below.\nhttps://github.com/taisazero/socratic-debugging-benchmark\n","authors":["Erfan Al-Hossami","Razvan Bunescu","Justin Smith","Ryan Teehan"],"pdf_url":"https://arxiv.org/pdf/2310.03210v1.pdf","comment":"8 pages, 2 tables. To be published in Proceedings of the 2024\n Technical Symposium on Computer Science Education (SIGCSE'24)"},{"id":"http://arxiv.org/abs/2304.08247v2","updated":"2023-10-04T23:28:00Z","published":"2023-04-14T11:28:08Z","title":"MedAlpaca -- An Open-Source Collection of Medical Conversational AI\n Models and Training Data","summary":" As large language models (LLMs) like OpenAI's GPT series continue to make\nstrides, we witness the emergence of artificial intelligence applications in an\never-expanding range of fields. In medicine, these LLMs hold considerable\npromise for improving medical workflows, diagnostics, patient care, and\neducation. Yet, there is an urgent need for open-source models that can be\ndeployed on-premises to safeguard patient privacy. In our work, we present an\ninnovative dataset consisting of over 160,000 entries, specifically crafted to\nfine-tune LLMs for effective medical applications. We investigate the impact of\nfine-tuning these datasets on publicly accessible pre-trained LLMs, and\nsubsequently, we juxtapose the performance of pre-trained-only models against\nthe fine-tuned models concerning the examinations that future medical doctors\nmust pass to achieve certification.\n","authors":["Tianyu Han","Lisa C. Adams","Jens-Michalis Papaioannou","Paul Grundmann","Tom Oberhauser","Alexander Löser","Daniel Truhn","Keno K. Bressem"],"pdf_url":"https://arxiv.org/pdf/2304.08247v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07629v2","updated":"2023-10-04T22:40:01Z","published":"2023-06-13T08:57:54Z","title":"SqueezeLLM: Dense-and-Sparse Quantization","summary":" Generative Large Language Models (LLMs) have demonstrated remarkable results\nfor a wide range of tasks. However, deploying these models for inference has\nbeen a significant challenge due to their unprecedented resource requirements.\nThis has forced existing deployment frameworks to use multi-GPU inference\npipelines, which are often complex and costly, or to use smaller and less\nperformant models. In this work, we demonstrate that the main bottleneck for\ngenerative inference with LLMs is memory bandwidth, rather than compute,\nspecifically for single batch inference. While quantization has emerged as a\npromising solution by representing model weights with reduced precision,\nprevious efforts have often resulted in notable performance degradation. To\naddress this, we introduce SqueezeLLM, a post-training quantization framework\nthat not only enables lossless compression to ultra-low precisions of up to\n3-bit, but also achieves higher quantization performance under the same memory\nconstraint. 
Our framework incorporates two novel ideas: (i) sensitivity-based\nnon-uniform quantization, which searches for the optimal bit precision\nassignment based on second-order information; and (ii) the Dense-and-Sparse\ndecomposition that stores outliers and sensitive weight values in an efficient\nsparse format. When applied to the LLaMA models, our 3-bit quantization\nsignificantly reduces the perplexity gap from the FP16 baseline by up to 2.1x\nas compared to the state-of-the-art methods with the same memory requirement.\nFurthermore, when deployed on an A6000 GPU, our quantized models achieve up to\n2.3x speedup compared to the baseline. Our code is open-sourced and available\nonline.\n","authors":["Sehoon Kim","Coleman Hooper","Amir Gholami","Zhen Dong","Xiuyu Li","Sheng Shen","Michael W. Mahoney","Kurt Keutzer"],"pdf_url":"https://arxiv.org/pdf/2306.07629v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03193v1","updated":"2023-10-04T22:34:56Z","published":"2023-10-04T22:34:56Z","title":"The Rise of Open Science: Tracking the Evolution and Perceived Value of\n Data and Methods Link-Sharing Practices","summary":" In recent years, funding agencies and journals increasingly advocate for open\nscience practices (e.g. data and method sharing) to improve the transparency,\naccess, and reproducibility of science. However, quantifying these practices at\nscale has proven difficult. In this work, we leverage a large-scale dataset of\n1.1M papers from arXiv that are representative of the fields of physics, math,\nand computer science to analyze the adoption of data and method link-sharing\npractices over time and their impact on article reception. To identify links to\ndata and methods, we train a neural text classification model to automatically\nclassify URL types based on contextual mentions in papers. We find evidence\nthat the practice of link-sharing to methods and data is spreading as more\npapers include such URLs over time. Reproducibility efforts may also be\nspreading because the same links are being increasingly reused across papers\n(especially in computer science); and these links are increasingly concentrated\nwithin fewer web domains (e.g. Github) over time. Lastly, articles that share\ndata and method links receive increased recognition in terms of citation count,\nwith a stronger effect when the shared links are active (rather than defunct).\nTogether, these findings demonstrate the increased spread and perceived value\nof data and method sharing practices in open science.\n","authors":["Hancheng Cao","Jesse Dodge","Kyle Lo","Daniel A. McFarland","Lucy Lu Wang"],"pdf_url":"https://arxiv.org/pdf/2310.03193v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17455v2","updated":"2023-10-04T22:11:50Z","published":"2023-05-27T12:07:21Z","title":"CrossGET: Cross-Guided Ensemble of Tokens for Accelerating\n Vision-Language Transformers","summary":" Recent vision-language models have achieved tremendous progress far beyond\nwhat we ever expected. However, their computational costs are also dramatically\ngrowing with rapid development, especially for the large models. It makes model\nacceleration exceedingly critical in a scenario of limited resources. Although\nextensively studied for unimodal models, the acceleration for multimodal\nmodels, especially the vision-language Transformers, is relatively\nunder-explored. 
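To illustrate the Dense-and-Sparse decomposition described in the SqueezeLLM abstract above, the sketch below keeps the largest-magnitude weights in a full-precision sparse matrix and quantizes the remaining dense part with a small k-means (non-uniform) codebook. The outlier fraction, the codebook size, and the use of scikit-learn are illustrative choices, not the paper's sensitivity-aware implementation.

    import numpy as np
    from scipy import sparse
    from sklearn.cluster import KMeans

    def dense_and_sparse_quantize(W, outlier_frac=0.005, n_levels=8):
        """Split W into a full-precision sparse outlier part plus a dense part
        quantized to a non-uniform k-means codebook (illustrative only)."""
        W = np.asarray(W, dtype=np.float32)
        cutoff = np.quantile(np.abs(W), 1.0 - outlier_frac)
        outlier_mask = np.abs(W) >= cutoff
        sparse_part = sparse.csr_matrix(W * outlier_mask)
        km = KMeans(n_clusters=n_levels, n_init=10).fit(W[~outlier_mask].reshape(-1, 1))
        codes = km.predict(W.reshape(-1, 1)).reshape(W.shape)
        dense_part = km.cluster_centers_[codes].squeeze(-1) * (~outlier_mask)
        return sparse_part, dense_part  # reconstruction: sparse_part + dense_part

    W = np.random.randn(64, 64)
    S, D = dense_and_sparse_quantize(W)
    print(np.abs(W - (S.toarray() + D)).mean())  # small reconstruction error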
To pursue more efficient and accessible vision-language\nTransformers, this paper introduces \\textbf{Cross}-\\textbf{G}uided\n\\textbf{E}nsemble of \\textbf{T}okens (\\textbf{\\emph{CrossGET}}), a universal\nacceleration framework for vision-language Transformers. This framework\nadaptively combines tokens through real-time, cross-modal guidance, thereby\nachieving substantial acceleration while keeping high performance.\n\\textit{CrossGET} has two key innovations: 1) \\textit{Cross-Guided Matching and\nEnsemble}. \\textit{CrossGET} incorporates cross-modal guided token matching and\nensemble to exploit cross-modal information effectively, only introducing\ncross-modal tokens with negligible extra parameters. 2) \\textit{Complete-Graph\nSoft Matching}. In contrast to the existing bipartite soft matching approach,\n\\textit{CrossGET} introduces a complete-graph soft matching policy to achieve\nmore reliable token-matching results while maintaining parallelizability and\nhigh efficiency. Extensive experiments are conducted on various vision-language\ntasks, including image-text retrieval, visual reasoning, image captioning, and\nvisual question answering. Performance on both classic multimodal architectures\nand emerging multimodal LLMs demonstrate the effectiveness and versatility of\nthe proposed \\textit{CrossGET} framework. The code will be at\n\\url{https://github.com/sdc17/CrossGET}.\n","authors":["Dachuan Shi","Chaofan Tao","Anyi Rao","Zhendong Yang","Chun Yuan","Jiaqi Wang"],"pdf_url":"https://arxiv.org/pdf/2305.17455v2.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2310.03184v1","updated":"2023-10-04T22:09:28Z","published":"2023-10-04T22:09:28Z","title":"Retrieval-augmented Generation to Improve Math Question-Answering:\n Trade-offs Between Groundedness and Human Preference","summary":" For middle-school math students, interactive question-answering (QA) with\ntutors is an effective way to learn. The flexibility and emergent capabilities\nof generative large language models (LLMs) has led to a surge of interest in\nautomating portions of the tutoring process - including interactive QA to\nsupport conceptual discussion of mathematical concepts. However, LLM responses\nto math questions can be incorrect or mismatched to the educational context -\nsuch as being misaligned with a school's curriculum. One potential solution is\nretrieval-augmented generation (RAG), which involves incorporating a vetted\nexternal knowledge source in the LLM prompt to increase response quality. In\nthis paper, we designed prompts that retrieve and use content from a\nhigh-quality open-source math textbook to generate responses to real student\nquestions. We evaluate the efficacy of this RAG system for middle-school\nalgebra and geometry QA by administering a multi-condition survey, finding that\nhumans prefer responses generated using RAG, but not when responses are too\ngrounded in the textbook content. 
We argue that while RAG is able to improve\nresponse quality, designers of math QA systems must consider trade-offs between\ngenerating responses preferred by students and responses closely matched to\nspecific educational resources.\n","authors":["Zachary Levonian","Chenglu Li","Wangda Zhu","Anoushka Gade","Owen Henkel","Millie-Ellen Postle","Wanli Xing"],"pdf_url":"https://arxiv.org/pdf/2310.03184v1.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2206.05895v4","updated":"2023-10-04T22:00:21Z","published":"2022-06-13T03:41:31Z","title":"Latent Diffusion Energy-Based Model for Interpretable Text Modeling","summary":" Latent space Energy-Based Models (EBMs), also known as energy-based priors,\nhave drawn growing interests in generative modeling. Fueled by its flexibility\nin the formulation and strong modeling power of the latent space, recent works\nbuilt upon it have made interesting attempts aiming at the interpretability of\ntext modeling. However, latent space EBMs also inherit some flaws from EBMs in\ndata space; the degenerate MCMC sampling quality in practice can lead to poor\ngeneration quality and instability in training, especially on data with complex\nlatent structures. Inspired by the recent efforts that leverage diffusion\nrecovery likelihood learning as a cure for the sampling issue, we introduce a\nnovel symbiosis between the diffusion models and latent space EBMs in a\nvariational learning framework, coined as the latent diffusion energy-based\nmodel. We develop a geometric clustering-based regularization jointly with the\ninformation bottleneck to further improve the quality of the learned latent\nspace. Experiments on several challenging tasks demonstrate the superior\nperformance of our model on interpretable text modeling over strong\ncounterparts.\n","authors":["Peiyu Yu","Sirui Xie","Xiaojian Ma","Baoxiong Jia","Bo Pang","Ruiqi Gao","Yixin Zhu","Song-Chun Zhu","Ying Nian Wu"],"pdf_url":"https://arxiv.org/pdf/2206.05895v4.pdf","comment":"ICML 2022"},{"id":"http://arxiv.org/abs/2310.03182v1","updated":"2023-10-04T21:57:09Z","published":"2023-10-04T21:57:09Z","title":"Robust and Interpretable Medical Image Classifiers via Concept\n Bottleneck Models","summary":" Medical image classification is a critical problem for healthcare, with the\npotential to alleviate the workload of doctors and facilitate diagnoses of\npatients. However, two challenges arise when deploying deep learning models to\nreal-world healthcare applications. First, neural models tend to learn spurious\ncorrelations instead of desired features, which could fall short when\ngeneralizing to new domains (e.g., patients with different ages). Second, these\nblack-box models lack interpretability. When making diagnostic predictions, it\nis important to understand why a model makes a decision for trustworthy and\nsafety considerations. In this paper, to address these two limitations, we\npropose a new paradigm to build robust and interpretable medical image\nclassifiers with natural language concepts. Specifically, we first query\nclinical concepts from GPT-4, then transform latent image features into\nexplicit concepts with a vision-language model. We systematically evaluate our\nmethod on eight medical image classification datasets to verify its\neffectiveness. On challenging datasets with strong confounding factors, our\nmethod can mitigate spurious correlations thus substantially outperform\nstandard visual encoders and other baselines. 
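The retrieval-augmented math QA abstract above retrieves vetted textbook content and includes it in the LLM prompt. A minimal sketch of that pipeline over assumed pre-computed embeddings follows; the similarity search, the prompt wording, and the embedding source are placeholders, not the authors' system.

    import numpy as np

    def retrieve(question_vec, passage_vecs, passages, k=3):
        """Cosine-similarity retrieval over pre-embedded textbook passages."""
        q = question_vec / np.linalg.norm(question_vec)
        P = passage_vecs / np.linalg.norm(passage_vecs, axis=1, keepdims=True)
        top = np.argsort(P @ q)[::-1][:k]
        return [passages[i] for i in top]

    def build_rag_prompt(question, retrieved):
        context = "\n\n".join(retrieved)
        return (
            "Answer the student's question using the textbook excerpts below, "
            "keeping the tone conversational rather than copying them verbatim.\n\n"
            f"Excerpts:\n{context}\n\nStudent question: {question}\nAnswer:"
        )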
Finally, we show how\nclassification with a small number of concepts brings a level of\ninterpretability for understanding model decisions through case studies in real\nmedical data.\n","authors":["An Yan","Yu Wang","Yiwu Zhong","Zexue He","Petros Karypis","Zihan Wang","Chengyu Dong","Amilcare Gentili","Chun-Nan Hsu","Jingbo Shang","Julian McAuley"],"pdf_url":"https://arxiv.org/pdf/2310.03182v1.pdf","comment":"18 pages, 12 figures"},{"id":"http://arxiv.org/abs/2310.03173v1","updated":"2023-10-04T21:40:36Z","published":"2023-10-04T21:40:36Z","title":"$\\mathcal{B}$-Coder: Value-Based Deep Reinforcement Learning for Program\n Synthesis","summary":" Program synthesis aims to create accurate, executable code from natural\nlanguage descriptions. This field has leveraged the power of reinforcement\nlearning (RL) in conjunction with large language models (LLMs), significantly\nenhancing code generation capabilities. This integration focuses on directly\noptimizing functional correctness, transcending conventional supervised losses.\nWhile current literature predominantly favors policy-based algorithms,\nattributes of program synthesis suggest a natural compatibility with\nvalue-based methods. This stems from rich collection of off-policy programs\ndeveloped by human programmers, and the straightforward verification of\ngenerated programs through automated unit testing (i.e. easily obtainable\nrewards in RL language). Diverging from the predominant use of policy-based\nalgorithms, our work explores the applicability of value-based approaches,\nleading to the development of our $\\mathcal{B}$-Coder (pronounced Bellman\ncoder). Yet, training value-based methods presents challenges due to the\nenormous search space inherent to program synthesis. To this end, we propose an\ninitialization protocol for RL agents utilizing pre-trained LMs and a\nconservative Bellman operator to reduce training complexities. Moreover, we\ndemonstrate how to leverage the learned value functions as a dual strategy to\npost-process generated programs. Our empirical evaluations demonstrated\n$\\mathcal{B}$-Coder's capability in achieving state-of-the-art performance\ncompared with policy-based methods. Remarkably, this achievement is reached\nwith minimal reward engineering effort, highlighting the effectiveness of\nvalue-based RL, independent of reward designs.\n","authors":["Zishun Yu","Yunzhe Tao","Liyu Chen","Tao Sun","Hongxia Yang"],"pdf_url":"https://arxiv.org/pdf/2310.03173v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.00635v2","updated":"2023-10-04T20:27:57Z","published":"2022-11-01T17:56:57Z","title":"Two-stage LLM Fine-tuning with Less Specialization and More\n Generalization","summary":" Pretrained large language models (LLMs) are general purpose problem solvers\napplicable to a diverse set of tasks with prompts. They can be further improved\ntowards a specific task by fine-tuning on a specialized dataset. However,\nfine-tuning usually makes the model narrowly specialized on this dataset with\nreduced general in-context learning performances, which is undesirable whenever\nthe fine-tuned model needs to handle additional tasks where no fine-tuning data\nis available. In this work, we first demonstrate that fine-tuning on a single\ntask indeed decreases LLMs' general in-context learning performance. We\ndiscover one important cause of such forgetting, format specialization, where\nthe model overfits to the format of the fine-tuned task. 
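The concept-bottleneck abstract above maps latent image features to explicit concept scores with a vision-language model and then classifies from those scores alone. A schematic PyTorch sketch follows; the dual-encoder embeddings, the concept list, and the linear head are generic assumptions rather than the paper's exact architecture.

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class ConceptBottleneckClassifier(nn.Module):
        """Sketch: score images against text embeddings of clinical concepts,
        then classify from the concept scores (the interpretable bottleneck)."""
        def __init__(self, concept_text_emb: torch.Tensor, num_classes: int):
            super().__init__()
            # (num_concepts, dim) embeddings of concept descriptions, assumed to
            # come from a CLIP-like text encoder queried with generated concepts.
            self.register_buffer("concepts", F.normalize(concept_text_emb, dim=-1))
            self.head = nn.Linear(concept_text_emb.size(0), num_classes)

        def forward(self, image_emb: torch.Tensor):
            img = F.normalize(image_emb, dim=-1)        # (batch, dim)
            scores = img @ self.concepts.T              # (batch, num_concepts)
            return self.head(scores), scores            # logits + explanation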
We further show that\nformat specialization happens at the very beginning of fine-tuning. To solve\nthis problem, we propose Prompt Tuning with MOdel Tuning (ProMoT), a simple yet\neffective two-stage fine-tuning framework that reduces format specialization\nand improves generalization. ProMoT offloads task-specific format learning into\nadditional and removable parameters by first doing prompt tuning and then\nfine-tuning the model itself with this soft prompt attached. With experiments\non several fine-tuning tasks and 8 in-context evaluation tasks, we show that\nProMoT achieves comparable performance on fine-tuned tasks to standard\nfine-tuning, but with much less loss of in-context learning performances across\na board range of out-of-domain evaluation tasks. More importantly, ProMoT can\neven enhance generalization on in-context learning tasks that are semantically\nrelated to the fine-tuned task, e.g. ProMoT on En-Fr translation significantly\nimproves performance on other language pairs, and ProMoT on NLI improves\nperformance on summarization. Experiments also show that ProMoT can improve the\ngeneralization performance of multi-task training.\n","authors":["Yihan Wang","Si Si","Daliang Li","Michal Lukasik","Felix Yu","Cho-Jui Hsieh","Inderjit S Dhillon","Sanjiv Kumar"],"pdf_url":"https://arxiv.org/pdf/2211.00635v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.15630v2","updated":"2023-10-04T19:49:27Z","published":"2023-09-27T13:02:06Z","title":"NLPBench: Evaluating Large Language Models on Solving NLP Problems","summary":" Recent developments in large language models (LLMs) have shown promise in\nenhancing the capabilities of natural language processing (NLP). Despite these\nsuccesses, there remains a dearth of research dedicated to the NLP\nproblem-solving abilities of LLMs. To fill the gap in this area, we present a\nunique benchmarking dataset, NLPBench, comprising 378 college-level NLP\nquestions spanning various NLP topics sourced from Yale University's prior\nfinal exams. NLPBench includes questions with context, in which multiple\nsub-questions share the same public information, and diverse question types,\nincluding multiple choice, short answer, and math. Our evaluation, centered on\nLLMs such as GPT-3.5/4, PaLM-2, and LLAMA-2, incorporates advanced prompting\nstrategies like the chain-of-thought (CoT) and tree-of-thought (ToT). Our study\nreveals that the effectiveness of the advanced prompting strategies can be\ninconsistent, occasionally damaging LLM performance, especially in smaller\nmodels like the LLAMA-2 (13b). Furthermore, our manual assessment illuminated\nspecific shortcomings in LLMs' scientific problem-solving skills, with\nweaknesses in logical decomposition and reasoning notably affecting results.\n","authors":["Linxin Song","Jieyu Zhang","Lechao Cheng","Pengyuan Zhou","Tianyi Zhou","Irene Li"],"pdf_url":"https://arxiv.org/pdf/2309.15630v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03128v1","updated":"2023-10-04T19:39:26Z","published":"2023-10-04T19:39:26Z","title":"MetaTool Benchmark: Deciding Whether to Use Tools and Which to Use","summary":" Large language models (LLMs) have garnered significant attention due to their\nimpressive natural language processing (NLP) capabilities. Recently, many\nstudies have focused on the tool utilization ability of LLMs. 
They primarily\ninvestigated how LLMs effectively collaborate with given specific tools.\nHowever, in scenarios where LLMs serve as intelligent agents, as seen in\napplications like AutoGPT and MetaGPT, LLMs are expected to engage in intricate\ndecision-making processes that involve deciding whether to employ a tool and\nselecting the most suitable tool(s) from a collection of available tools to\nfulfill user requests. Therefore, in this paper, we introduce MetaTool, a\nbenchmark designed to evaluate whether LLMs have tool usage awareness and can\ncorrectly choose tools. Specifically, we create a dataset called ToolE within\nthe benchmark. This dataset contains various types of user queries in the form\nof prompts that trigger LLMs to use tools, including both single-tool and\nmulti-tool scenarios. Subsequently, we set the tasks for both tool usage\nawareness and tool selection. We define four subtasks from different\nperspectives in tool selection, including tool selection with similar choices,\ntool selection in specific scenarios, tool selection with possible reliability\nissues, and multi-tool selection. We conduct experiments involving nine popular\nLLMs and find that the majority of them still struggle to effectively select\ntools, highlighting the existing gaps between LLMs and genuine intelligent\nagents. However, through the error analysis, we found there is still\nsignificant room for improvement. Finally, we conclude with insights for tool\ndevelopers that follow ChatGPT to provide detailed descriptions that can\nenhance the tool selection performance of LLMs.\n","authors":["Yue Huang","Jiawen Shi","Yuan Li","Chenrui Fan","Siyuan Wu","Qihui Zhang","Yixin Liu","Pan Zhou","Yao Wan","Neil Zhenqiang Gong","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2310.03128v1.pdf","comment":null},{"id":"http://arxiv.org/abs/1912.05957v3","updated":"2023-10-04T19:09:25Z","published":"2019-12-12T13:54:09Z","title":"Text as Environment: A Deep Reinforcement Learning Text Readability\n Assessment Model","summary":" Evaluating the readability of a text can significantly facilitate the precise\nexpression of information in written form. The formulation of text readability\nassessment involves the identification of meaningful properties of the text\nregardless of its length. Sophisticated features and models are used to\nevaluate the comprehensibility of texts accurately. Despite this, the problem\nof assessing texts' readability efficiently remains relatively untouched. The\nefficiency of state-of-the-art text readability assessment models can be\nfurther improved using deep reinforcement learning models. Using a hard\nattention-based active inference technique, the proposed approach makes\nefficient use of input text and computational resources. Through the use of\nsemi-supervised signals, the reinforcement learning model uses the minimum\namount of text in order to determine text's readability. 
A comparison of the\nmodel on Weebit and Cambridge Exams with state-of-the-art models, such as the\nBERT text readability model, shows that it is capable of achieving\nstate-of-the-art accuracy with a significantly smaller amount of input text\nthan other models.\n","authors":["Hamid Mohammadi","Seyed Hossein Khasteh"],"pdf_url":"https://arxiv.org/pdf/1912.05957v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03094v1","updated":"2023-10-04T18:21:17Z","published":"2023-10-04T18:21:17Z","title":"Large Language Model Cascades with Mixture of Thoughts Representations\n for Cost-efficient Reasoning","summary":" Large language models (LLMs) such as GPT-4 have exhibited remarkable\nperformance in a variety of tasks, but this strong performance often comes with\nthe high expense of using paid API services. In this paper, we are motivated to\nstudy building an LLM cascade to save the cost of using LLMs, particularly for\nperforming reasoning (e.g., mathematical, causal) tasks. Our cascade pipeline\nfollows the intuition that simpler questions can be addressed by a weaker but\nmore affordable LLM, whereas only the challenging questions necessitate the\nstronger and more expensive LLM. To realize this decision-making, we consider\nthe \"answer consistency\" of the weaker LLM as a signal of the question\ndifficulty and propose several methods for the answer sampling and consistency\nchecking, including one leveraging a mixture of two thought representations\n(i.e., Chain-of-Thought and Program-of-Thought). Through experiments on six\nreasoning benchmark datasets, with GPT-3.5-turbo and GPT-4 being the weaker and\nstronger LLMs, respectively, we demonstrate that our proposed LLM cascades can\nachieve performance comparable to using solely the stronger LLM but require\nonly 40% of its cost.\n","authors":["Murong Yue","Jie Zhao","Min Zhang","Liang Du","Ziyu Yao"],"pdf_url":"https://arxiv.org/pdf/2310.03094v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01425v2","updated":"2023-10-04T18:10:30Z","published":"2023-09-27T16:15:34Z","title":"Borges and AI","summary":" Many believe that Large Language Models (LLMs) open the era of Artificial\nIntelligence (AI). Some see opportunities while others see dangers. Yet both\nproponents and opponents grasp AI through the imagery popularised by science\nfiction. Will the machine become sentient and rebel against its creators? Will\nwe experience a paperclip apocalypse? Before answering such questions, we\nshould first ask whether this mental imagery provides a good description of the\nphenomenon at hand. Understanding weather patterns through the moods of the\ngods only goes so far. The present paper instead advocates understanding LLMs\nand their connection to AI through the imagery of Jorge Luis Borges, a master\nof 20th century literature, forerunner of magical realism, and precursor to\npostmodern literature. This exercise leads to a new perspective that\nilluminates the relation between language modelling and artificial\nintelligence.\n","authors":["Léon Bottou","Bernhard Schölkopf"],"pdf_url":"https://arxiv.org/pdf/2310.01425v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03084v1","updated":"2023-10-04T18:02:01Z","published":"2023-10-04T18:02:01Z","title":"Discovering Knowledge-Critical Subnetworks in Pretrained Language Models","summary":" Pretrained language models (LMs) encode implicit representations of knowledge\nin their parameters. 
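The LLM-cascade abstract above routes a question to the stronger, more expensive model only when the weaker model's sampled answers disagree. A minimal sketch of that answer-consistency routing follows; weak_llm and strong_llm are hypothetical callables, and the sample count and agreement threshold are assumptions.

    from collections import Counter

    def cascade_answer(question, weak_llm, strong_llm, n_samples=5, threshold=0.8):
        """Sample the weak model several times; accept the majority answer if the
        samples agree often enough, otherwise escalate to the strong model."""
        samples = [weak_llm(question) for _ in range(n_samples)]
        answer, votes = Counter(samples).most_common(1)[0]
        if votes / n_samples >= threshold:
            return answer            # consistent -> treated as an easy question
        return strong_llm(question)  # inconsistent -> pay for the stronger model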
However, localizing these representations and\ndisentangling them from each other remains an open problem. In this work, we\ninvestigate whether pretrained language models contain various\nknowledge-critical subnetworks: particular sparse computational subgraphs\nresponsible for encoding specific knowledge the model has memorized. We propose\na multi-objective differentiable weight masking scheme to discover these\nsubnetworks and show that we can use them to precisely remove specific\nknowledge from models while minimizing adverse effects on the behavior of the\noriginal language model. We demonstrate our method on multiple GPT2 variants,\nuncovering highly sparse subnetworks (98%+) that are solely responsible for\nspecific collections of relational knowledge. When these subnetworks are\nremoved, the remaining network maintains most of its initial capacity (modeling\nlanguage and other memorized relational knowledge) but struggles to express the\nremoved knowledge, and suffers performance drops on examples needing this\nremoved knowledge on downstream tasks after finetuning.\n","authors":["Deniz Bayazit","Negar Foroutan","Zeming Chen","Gail Weiss","Antoine Bosselut"],"pdf_url":"https://arxiv.org/pdf/2310.03084v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03051v1","updated":"2023-10-04T06:47:58Z","published":"2023-10-04T06:47:58Z","title":"How FaR Are Large Language Models From Agents with Theory-of-Mind?","summary":" \"Thinking is for Doing.\" Humans can infer other people's mental states from\nobservations--an ability called Theory-of-Mind (ToM)--and subsequently act\npragmatically on those inferences. Existing question answering benchmarks such\nas ToMi ask models questions to make inferences about beliefs of characters in\na story, but do not test whether models can then use these inferences to guide\ntheir actions. We propose a new evaluation paradigm for large language models\n(LLMs): Thinking for Doing (T4D), which requires models to connect inferences\nabout others' mental states to actions in social scenarios. Experiments on T4D\ndemonstrate that LLMs such as GPT-4 and PaLM 2 seemingly excel at tracking\ncharacters' beliefs in stories, but they struggle to translate this capability\ninto strategic action. Our analysis reveals the core challenge for LLMs lies in\nidentifying the implicit inferences about mental states without being\nexplicitly asked about as in ToMi, that lead to choosing the correct action in\nT4D. To bridge this gap, we introduce a zero-shot prompting framework, Foresee\nand Reflect (FaR), which provides a reasoning structure that encourages LLMs to\nanticipate future challenges and reason about potential actions. FaR boosts\nGPT-4's performance from 50% to 71% on T4D, outperforming other prompting\nmethods such as Chain-of-Thought and Self-Ask. Moreover, FaR generalizes to\ndiverse out-of-distribution story structures and scenarios that also require\nToM inferences to choose an action, consistently outperforming other methods\nincluding few-shot in-context learning.\n","authors":["Pei Zhou","Aman Madaan","Srividya Pranavi Potharaju","Aditya Gupta","Kevin R. 
McKee","Ari Holtzman","Jay Pujara","Xiang Ren","Swaroop Mishra","Aida Nematzadeh","Shyam Upadhyay","Manaal Faruqui"],"pdf_url":"https://arxiv.org/pdf/2310.03051v1.pdf","comment":"Preprint, 18 pages, 6 figures, 6 tables"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2310.03026v1","updated":"2023-10-04T17:59:49Z","published":"2023-10-04T17:59:49Z","title":"LanguageMPC: Large Language Models as Decision Makers for Autonomous\n Driving","summary":" Existing learning-based autonomous driving (AD) systems face challenges in\ncomprehending high-level information, generalizing to rare events, and\nproviding interpretability. To address these problems, this work employs Large\nLanguage Models (LLMs) as a decision-making component for complex AD scenarios\nthat require human commonsense understanding. We devise cognitive pathways to\nenable comprehensive reasoning with LLMs, and develop algorithms for\ntranslating LLM decisions into actionable driving commands. Through this\napproach, LLM decisions are seamlessly integrated with low-level controllers by\nguided parameter matrix adaptation. Extensive experiments demonstrate that our\nproposed method not only consistently surpasses baseline approaches in\nsingle-vehicle tasks, but also helps handle complex driving behaviors even\nmulti-vehicle coordination, thanks to the commonsense reasoning capabilities of\nLLMs. This paper presents an initial step toward leveraging LLMs as effective\ndecision-makers for intricate AD scenarios in terms of safety, efficiency,\ngeneralizability, and interoperability. We aspire for it to serve as\ninspiration for future research in this field. Project page:\nhttps://sites.google.com/view/llm-mpc\n","authors":["Hao Sha","Yao Mu","Yuxuan Jiang","Li Chen","Chenfeng Xu","Ping Luo","Shengbo Eben Li","Masayoshi Tomizuka","Wei Zhan","Mingyu Ding"],"pdf_url":"https://arxiv.org/pdf/2310.03026v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03023v1","updated":"2023-10-04T17:59:38Z","published":"2023-10-04T17:59:38Z","title":"Human-oriented Representation Learning for Robotic Manipulation","summary":" Humans inherently possess generalizable visual representations that empower\nthem to efficiently explore and interact with the environments in manipulation\ntasks. We advocate that such a representation automatically arises from\nsimultaneously learning about multiple simple perceptual skills that are\ncritical for everyday scenarios (e.g., hand detection, state estimate, etc.)\nand is better suited for learning robot manipulation policies compared to\ncurrent state-of-the-art visual representations purely based on self-supervised\nobjectives. We formalize this idea through the lens of human-oriented\nmulti-task fine-tuning on top of pre-trained visual encoders, where each task\nis a perceptual skill tied to human-environment interactions. We introduce Task\nFusion Decoder as a plug-and-play embedding translator that utilizes the\nunderlying relationships among these perceptual skills to guide the\nrepresentation learning towards encoding meaningful structure for what's\nimportant for all perceptual skills, ultimately empowering learning of\ndownstream robotic manipulation tasks. Extensive experiments across a range of\nrobotic tasks and embodiments, in both simulations and real-world environments,\nshow that our Task Fusion Decoder consistently improves the representation of\nthree state-of-the-art visual encoders including R3M, MVP, and EgoVLP, for\ndownstream manipulation policy-learning. 
Project page:\nhttps://sites.google.com/view/human-oriented-robot-learning\n","authors":["Mingxiao Huo","Mingyu Ding","Chenfeng Xu","Thomas Tian","Xinghao Zhu","Yao Mu","Lingfeng Sun","Masayoshi Tomizuka","Wei Zhan"],"pdf_url":"https://arxiv.org/pdf/2310.03023v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03020v1","updated":"2023-10-04T17:58:57Z","published":"2023-10-04T17:58:57Z","title":"Consistent-1-to-3: Consistent Image to 3D View Synthesis via\n Geometry-aware Diffusion Models","summary":" Zero-shot novel view synthesis (NVS) from a single image is an essential\nproblem in 3D object understanding. While recent approaches that leverage\npre-trained generative models can synthesize high-quality novel views from\nin-the-wild inputs, they still struggle to maintain 3D consistency across\ndifferent views. In this paper, we present Consistent-1-to-3, which is a\ngenerative framework that significantly mitigate this issue. Specifically, we\ndecompose the NVS task into two stages: (i) transforming observed regions to a\nnovel view, and (ii) hallucinating unseen regions. We design a scene\nrepresentation transformer and view-conditioned diffusion model for performing\nthese two stages respectively. Inside the models, to enforce 3D consistency, we\npropose to employ epipolor-guided attention to incorporate geometry\nconstraints, and multi-view attention to better aggregate multi-view\ninformation. Finally, we design a hierarchy generation paradigm to generate\nlong sequences of consistent views, allowing a full 360 observation of the\nprovided object image. Qualitative and quantitative evaluation over multiple\ndatasets demonstrate the effectiveness of the proposed mechanisms against\nstate-of-the-art approaches. Our project page is at\nhttps://jianglongye.com/consistent123/\n","authors":["Jianglong Ye","Peng Wang","Kejie Li","Yichun Shi","Heng Wang"],"pdf_url":"https://arxiv.org/pdf/2310.03020v1.pdf","comment":"Project page: https://jianglongye.com/consistent123/"},{"id":"http://arxiv.org/abs/2310.03015v1","updated":"2023-10-04T17:57:07Z","published":"2023-10-04T17:57:07Z","title":"Efficient-3DiM: Learning a Generalizable Single-image Novel-view\n Synthesizer in One Day","summary":" The task of novel view synthesis aims to generate unseen perspectives of an\nobject or scene from a limited set of input images. Nevertheless, synthesizing\nnovel views from a single image still remains a significant challenge in the\nrealm of computer vision. Previous approaches tackle this problem by adopting\nmesh prediction, multi-plain image construction, or more advanced techniques\nsuch as neural radiance fields. Recently, a pre-trained diffusion model that is\nspecifically designed for 2D image synthesis has demonstrated its capability in\nproducing photorealistic novel views, if sufficiently optimized on a 3D\nfinetuning task. Although the fidelity and generalizability are greatly\nimproved, training such a powerful diffusion model requires a vast volume of\ntraining data and model parameters, resulting in a notoriously long time and\nhigh computational costs. To tackle this issue, we propose Efficient-3DiM, a\nsimple but effective framework to learn a single-image novel-view synthesizer.\nMotivated by our in-depth analysis of the inference process of diffusion\nmodels, we propose several pragmatic strategies to reduce the training overhead\nto a manageable scale, including a crafted timestep sampling strategy, a\nsuperior 3D feature extractor, and an enhanced training scheme. 
When combined,\nour framework is able to reduce the total training time from 10 days to less\nthan 1 day, significantly accelerating the training process under the same\ncomputational platform (one instance with 8 Nvidia A100 GPUs). Comprehensive\nexperiments are conducted to demonstrate the efficiency and generalizability of\nour proposed method.\n","authors":["Yifan Jiang","Hao Tang","Jen-Hao Rick Chang","Liangchen Song","Zhangyang Wang","Liangliang Cao"],"pdf_url":"https://arxiv.org/pdf/2310.03015v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03007v1","updated":"2023-10-04T17:51:02Z","published":"2023-10-04T17:51:02Z","title":"Towards Domain-Specific Features Disentanglement for Domain\n Generalization","summary":" Distributional shift between domains poses great challenges to modern machine\nlearning algorithms. The domain generalization (DG) signifies a popular line\ntargeting this issue, where these methods intend to uncover universal patterns\nacross disparate distributions. Noted, the crucial challenge behind DG is the\nexistence of irrelevant domain features, and most prior works overlook this\ninformation. Motivated by this, we propose a novel contrastive-based\ndisentanglement method CDDG, to effectively utilize the disentangled features\nto exploit the over-looked domain-specific features, and thus facilitating the\nextraction of the desired cross-domain category features for DG tasks.\nSpecifically, CDDG learns to decouple inherent mutually exclusive features by\nleveraging them in the latent space, thus making the learning discriminative.\nExtensive experiments conducted on various benchmark datasets demonstrate the\nsuperiority of our method compared to other state-of-the-art approaches.\nFurthermore, visualization evaluations confirm the potential of our method in\nachieving effective feature disentanglement.\n","authors":["Hao Chen","Qi Zhang","Zenan Huang","Haobo Wang","Junbo Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.03007v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03006v1","updated":"2023-10-04T17:49:48Z","published":"2023-10-04T17:49:48Z","title":"COOLer: Class-Incremental Learning for Appearance-Based Multiple Object\n Tracking","summary":" Continual learning allows a model to learn multiple tasks sequentially while\nretaining the old knowledge without the training data of the preceding tasks.\nThis paper extends the scope of continual learning research to\nclass-incremental learning for \\ac{mot}, which is desirable to accommodate the\ncontinuously evolving needs of autonomous systems. Previous solutions for\ncontinual learning of object detectors do not address the data association\nstage of appearance-based trackers, leading to catastrophic forgetting of\nprevious classes' re-identification features. We introduce COOLer, a\nCOntrastive- and cOntinual-Learning-based tracker, which incrementally learns\nto track new categories while preserving past knowledge by training on a\ncombination of currently available ground truth labels and pseudo-labels\ngenerated by the past tracker. To further exacerbate the disentanglement of\ninstance representations, we introduce a novel contrastive class-incremental\ninstance representation learning technique. Finally, we propose a practical\nevaluation protocol for continual learning for MOT and conduct experiments on\nthe \\bdd and \\shift datasets. Experimental results demonstrate that COOLer\ncontinually learns while effectively addressing catastrophic forgetting of both\ntracking and detection. 
The code is available at\n\\url{https://github.com/BoSmallEar/COOLer}.\n","authors":["Zhizheng Liu","Mattia Segu","Fisher Yu"],"pdf_url":"https://arxiv.org/pdf/2310.03006v1.pdf","comment":"GCPR 2023 Oral"},{"id":"http://arxiv.org/abs/2310.03005v1","updated":"2023-10-04T17:48:23Z","published":"2023-10-04T17:48:23Z","title":"Reversing Deep Face Embeddings with Probable Privacy Protection","summary":" Generally, privacy-enhancing face recognition systems are designed to offer\npermanent protection of face embeddings. Recently, so-called soft-biometric\nprivacy-enhancement approaches have been introduced with the aim of canceling\nsoft-biometric attributes. These methods limit the amount of soft-biometric\ninformation (gender or skin-colour) that can be inferred from face embeddings.\nPrevious work has underlined the need for research into rigorous evaluations\nand standardised evaluation protocols when assessing privacy protection\ncapabilities. Motivated by this fact, this paper explores to what extent the\nnon-invertibility requirement can be met by methods that claim to provide\nsoft-biometric privacy protection. Additionally, a detailed vulnerability\nassessment of state-of-the-art face embedding extractors is analysed in terms\nof the transformation complexity used for privacy protection. In this context,\na well-known state-of-the-art face image reconstruction approach has been\nevaluated on protected face embeddings to break soft biometric privacy\nprotection. Experimental results show that biometric privacy-enhanced face\nembeddings can be reconstructed with an accuracy of up to approximately 98%,\ndepending on the complexity of the protection algorithm.\n","authors":["Daile Osorio-Roig","Paul A. Gerlitz","Christian Rathgeb","Christoph Busch"],"pdf_url":"https://arxiv.org/pdf/2310.03005v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03004v1","updated":"2023-10-04T17:45:14Z","published":"2023-10-04T17:45:14Z","title":"Soft Convex Quantization: Revisiting Vector Quantization with Convex\n Optimization","summary":" Vector Quantization (VQ) is a well-known technique in deep learning for\nextracting informative discrete latent representations. VQ-embedded models have\nshown impressive results in a range of applications including image and speech\ngeneration. VQ operates as a parametric K-means algorithm that quantizes inputs\nusing a single codebook vector in the forward pass. While powerful, this\ntechnique faces practical challenges including codebook collapse,\nnon-differentiability and lossy compression. To mitigate the aforementioned\nissues, we propose Soft Convex Quantization (SCQ) as a direct substitute for\nVQ. SCQ works like a differentiable convex optimization (DCO) layer: in the\nforward pass, we solve for the optimal convex combination of codebook vectors\nthat quantize the inputs. In the backward pass, we leverage differentiability\nthrough the optimality conditions of the forward solution. We then introduce a\nscalable relaxation of the SCQ optimization and demonstrate its efficacy on the\nCIFAR-10, GTSRB and LSUN datasets. 
We train powerful SCQ autoencoder models\nthat significantly outperform matched VQ-based architectures, observing an\norder of magnitude better image reconstruction and codebook usage with\ncomparable quantization runtime.\n","authors":["Tanmay Gautam","Reid Pryzant","Ziyi Yang","Chenguang Zhu","Somayeh Sojoudi"],"pdf_url":"https://arxiv.org/pdf/2310.03004v1.pdf","comment":"14 pages, 8 figures"},{"id":"http://arxiv.org/abs/2310.02998v1","updated":"2023-10-04T17:34:00Z","published":"2023-10-04T17:34:00Z","title":"ECoFLaP: Efficient Coarse-to-Fine Layer-Wise Pruning for Vision-Language\n Models","summary":" Large Vision-Language Models (LVLMs) can understand the world comprehensively\nby integrating rich information from different modalities, achieving remarkable\nperformance improvements on various multimodal downstream tasks. However,\ndeploying LVLMs is often problematic due to their massive computational/energy\ncosts and carbon consumption. Such issues make it infeasible to adopt\nconventional iterative global pruning, which is costly due to computing the\nHessian matrix of the entire large model for sparsification. Alternatively,\nseveral studies have recently proposed layer-wise pruning approaches to avoid\nthe expensive computation of global pruning and efficiently compress model\nweights according to their importance within a layer. However, these methods\noften suffer from suboptimal model compression due to their lack of a global\nperspective. To address this limitation in recent efficient pruning methods for\nlarge models, we propose Efficient Coarse-to-Fine Layer-Wise Pruning (ECoFLaP),\na two-stage coarse-to-fine weight pruning approach for LVLMs. We first\ndetermine the sparsity ratios of different layers or blocks by leveraging the\nglobal importance score, which is efficiently computed based on the\nzeroth-order approximation of the global model gradients. Then, the multimodal\nmodel performs local layer-wise unstructured weight pruning based on\nglobally-informed sparsity ratios. We validate our proposed method across\nvarious multimodal and unimodal models and datasets, demonstrating significant\nperformance improvements over prevalent pruning techniques in the high-sparsity\nregime.\n","authors":["Yi-Lin Sung","Jaehong Yoon","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2310.02998v1.pdf","comment":"Project page: https://ecoflap.github.io/"},{"id":"http://arxiv.org/abs/2310.02997v1","updated":"2023-10-04T17:32:32Z","published":"2023-10-04T17:32:32Z","title":"Optimizing Key-Selection for Face-based One-Time Biometrics via Morphing","summary":" Nowadays, facial recognition systems are still vulnerable to adversarial\nattacks. These attacks vary from simple perturbations of the input image to\nmodifying the parameters of the recognition model to impersonate an authorised\nsubject. So-called privacy-enhancing facial recognition systems have been\nmostly developed to provide protection of stored biometric reference data, i.e.\ntemplates. In the literature, privacy-enhancing facial recognition approaches\nhave focused solely on conventional security threats at the template level,\nignoring the growing concern related to adversarial attacks. Up to now, few\nworks have provided mechanisms to protect face recognition against adversarial\nattacks while maintaining high security at the template level. In this paper,\nwe propose different key selection strategies to improve the security of a\ncompetitive cancelable scheme operating at the signal level. 
Experimental\nresults show that certain strategies based on signal-level key selection can\nlead to complete blocking of the adversarial attack based on an iterative\noptimization for the most secure threshold, while for the most practical\nthreshold, the attack success chance can be decreased to approximately 5.0%.\n","authors":["Daile Osorio-Roig","Mahdi Ghafourian","Christian Rathgeb","Ruben Vera-Rodriguez","Christoph Busch","Julian Fierrez"],"pdf_url":"https://arxiv.org/pdf/2310.02997v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02992v1","updated":"2023-10-04T17:28:44Z","published":"2023-10-04T17:28:44Z","title":"Kosmos-G: Generating Images in Context with Multimodal Large Language\n Models","summary":" Recent advancements in text-to-image (T2I) and vision-language-to-image\n(VL2I) generation have made significant strides. However, the generation from\ngeneralized vision-language inputs, especially involving multiple images,\nremains under-explored. This paper presents Kosmos-G, a model that leverages\nthe advanced perception capabilities of Multimodal Large Language Models\n(MLLMs) to tackle the aforementioned challenge. Our approach aligns the output\nspace of MLLM with CLIP using the textual modality as an anchor and performs\ncompositional instruction tuning on curated data. Kosmos-G demonstrates a\nunique capability of zero-shot multi-entity subject-driven generation. Notably,\nthe score distillation instruction tuning requires no modifications to the\nimage decoder. This allows for a seamless substitution of CLIP and effortless\nintegration with a myriad of U-Net techniques ranging from fine-grained\ncontrols to personalized image decoder variants. We posit Kosmos-G as an\ninitial attempt towards the goal of \"image as a foreign language in image\ngeneration.\"\n","authors":["Xichen Pan","Li Dong","Shaohan Huang","Zhiliang Peng","Wenhu Chen","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2310.02992v1.pdf","comment":"Code: https://aka.ms/Kosmos-G Project Page:\n https://xichenpan.github.io/kosmosg"},{"id":"http://arxiv.org/abs/2310.02988v1","updated":"2023-10-04T17:25:10Z","published":"2023-10-04T17:25:10Z","title":"Probing Intersectional Biases in Vision-Language Models with\n Counterfactual Examples","summary":" While vision-language models (VLMs) have achieved remarkable performance\nimprovements recently, there is growing evidence that these models also possess\nharmful biases with respect to social attributes such as gender and race. Prior\nstudies have primarily focused on probing such bias attributes individually\nwhile ignoring biases associated with intersections between social attributes.\nThis could be due to the difficulty of collecting an exhaustive set of\nimage-text pairs for various combinations of social attributes from existing\ndatasets. To address this challenge, we employ text-to-image diffusion models\nto produce counterfactual examples for probing intersectional social biases at\nscale. Our approach utilizes Stable Diffusion with cross attention control to\nproduce sets of counterfactual image-text pairs that are highly similar in\ntheir depiction of a subject (e.g., a given occupation) while differing only in\ntheir depiction of intersectional social attributes (e.g., race & gender).
We\nconduct extensive experiments using our generated dataset which reveal the\nintersectional social biases present in state-of-the-art VLMs.\n","authors":["Phillip Howard","Avinash Madasu","Tiep Le","Gustavo Lujan Moreno","Vasudev Lal"],"pdf_url":"https://arxiv.org/pdf/2310.02988v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02977v1","updated":"2023-10-04T17:12:18Z","published":"2023-10-04T17:12:18Z","title":"T$^3$Bench: Benchmarking Current Progress in Text-to-3D Generation","summary":" Recent methods in text-to-3D leverage powerful pretrained diffusion models to\noptimize NeRF. Notably, these methods are able to produce high-quality 3D\nscenes without training on 3D data. Due to the open-ended nature of the task,\nmost studies evaluate their results with subjective case studies and user\nexperiments, thereby presenting a challenge in quantitatively addressing the\nquestion: How has current progress in Text-to-3D gone so far? In this paper, we\nintroduce T$^3$Bench, the first comprehensive text-to-3D benchmark containing\ndiverse text prompts of three increasing complexity levels that are specially\ndesigned for 3D generation. To assess both the subjective quality and the text\nalignment, we propose two automatic metrics based on multi-view images produced\nby the 3D contents. The quality metric combines multi-view text-image scores\nand regional convolution to detect quality and view inconsistency. The\nalignment metric uses multi-view captioning and Large Language Model (LLM)\nevaluation to measure text-3D consistency. Both metrics closely correlate with\ndifferent dimensions of human judgments, providing a paradigm for efficiently\nevaluating text-to-3D models. The benchmarking results, shown in Fig. 1, reveal\nperformance differences among six prevalent text-to-3D methods. Our analysis\nfurther highlights the common struggles for current methods on generating\nsurroundings and multi-object scenes, as well as the bottleneck of leveraging\n2D guidance for 3D generation. Our project page is available at:\nhttps://t3bench.com.\n","authors":["Yuze He","Yushi Bai","Matthieu Lin","Wang Zhao","Yubin Hu","Jenny Sheng","Ran Yi","Juanzi Li","Yong-Jin Liu"],"pdf_url":"https://arxiv.org/pdf/2310.02977v1.pdf","comment":"16 pages, 11 figures"},{"id":"http://arxiv.org/abs/2310.02972v1","updated":"2023-10-04T17:10:13Z","published":"2023-10-04T17:10:13Z","title":"Fully Automatic Segmentation of Gross Target Volume and Organs-at-Risk\n for Radiotherapy Planning of Nasopharyngeal Carcinoma","summary":" Target segmentation in CT images of Head&Neck (H&N) region is challenging due\nto low contrast between adjacent soft tissue. The SegRap 2023 challenge has\nbeen focused on benchmarking the segmentation algorithms of Nasopharyngeal\nCarcinoma (NPC) which would be employed as auto-contouring tools for radiation\ntreatment planning purposes. We propose a fully-automatic framework and develop\ntwo models for a) segmentation of 45 Organs at Risk (OARs) and b) two Gross\nTumor Volumes (GTVs). To this end, we preprocess the image volumes by\nharmonizing the intensity distributions and then automatically cropping the\nvolumes around the target regions. The preprocessed volumes were employed to\ntrain a standard 3D U-Net model for each task, separately. 
Our method took\nsecond place for each of the tasks in the validation phase of the challenge.\nThe proposed framework is available at https://github.com/Astarakee/segrap2023\n","authors":["Mehdi Astaraki","Simone Bendazzoli","Iuliana Toma-Dasu"],"pdf_url":"https://arxiv.org/pdf/2310.02972v1.pdf","comment":"9 pages, 5 figures, 3 tables, MICCAI SegRap challenge contribution"},{"id":"http://arxiv.org/abs/2310.02960v1","updated":"2023-10-04T16:50:51Z","published":"2023-10-04T16:50:51Z","title":"CoDA: Collaborative Novel Box Discovery and Cross-modal Alignment for\n Open-vocabulary 3D Object Detection","summary":" Open-vocabulary 3D Object Detection (OV-3DDet) aims to detect objects from an\narbitrary list of categories within a 3D scene, which remains seldom explored\nin the literature. There are primarily two fundamental problems in OV-3DDet,\ni.e., localizing and classifying novel objects. This paper aims at addressing\nthe two problems simultaneously via a unified framework, under the condition of\nlimited base categories. To localize novel 3D objects, we propose an effective\n3D Novel Object Discovery strategy, which utilizes both the 3D box geometry\npriors and 2D semantic open-vocabulary priors to generate pseudo box labels of\nthe novel objects. To classify novel object boxes, we further develop a\ncross-modal alignment module based on discovered novel boxes, to align feature\nspaces between 3D point cloud and image/text modalities. Specifically, the\nalignment process contains a class-agnostic and a class-discriminative\nalignment, incorporating not only the base objects with annotations but also\nthe increasingly discovered novel objects, resulting in an iteratively enhanced\nalignment. The novel box discovery and crossmodal alignment are jointly learned\nto collaboratively benefit each other. The novel object discovery can directly\nimpact the cross-modal alignment, while a better feature alignment can, in\nturn, boost the localization capability, leading to a unified OV-3DDet\nframework, named CoDA, for simultaneous novel object localization and\nclassification. Extensive experiments on two challenging datasets (i.e.,\nSUN-RGBD and ScanNet) demonstrate the effectiveness of our method and also show\na significant mAP improvement upon the best-performing alternative method by\n80%. Codes and pre-trained models are released on the project page.\n","authors":["Yang Cao","Yihan Zeng","Hang Xu","Dan Xu"],"pdf_url":"https://arxiv.org/pdf/2310.02960v1.pdf","comment":"Accepted by NeurIPS 2023. Project Page:\n https://yangcaoai.github.io/publications/CoDA.html"},{"id":"http://arxiv.org/abs/2310.00357v2","updated":"2023-10-04T16:34:58Z","published":"2023-09-30T12:27:53Z","title":"Structural Adversarial Objectives for Self-Supervised Representation\n Learning","summary":" Within the framework of generative adversarial networks (GANs), we propose\nobjectives that task the discriminator for self-supervised representation\nlearning via additional structural modeling responsibilities. In combination\nwith an efficient smoothness regularizer imposed on the network, these\nobjectives guide the discriminator to learn to extract informative\nrepresentations, while maintaining a generator capable of sampling from the\ndomain. Specifically, our objectives encourage the discriminator to structure\nfeatures at two levels of granularity: aligning distribution characteristics,\nsuch as mean and variance, at coarse scales, and grouping features into local\nclusters at finer scales. 
Operating as a feature learner within the GAN\nframework frees our self-supervised system from the reliance on hand-crafted\ndata augmentation schemes that are prevalent across contrastive representation\nlearning methods. Across CIFAR-10/100 and an ImageNet subset, experiments\ndemonstrate that equipping GANs with our self-supervised objectives suffices to\nproduce discriminators which, evaluated in terms of representation learning,\ncompete with networks trained by contrastive learning approaches.\n","authors":["Xiao Zhang","Michael Maire"],"pdf_url":"https://arxiv.org/pdf/2310.00357v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02944v1","updated":"2023-10-04T16:24:00Z","published":"2023-10-04T16:24:00Z","title":"Adaptive Landmark Color for AUV Docking in Visually Dynamic Environments","summary":" Autonomous Underwater Vehicles (AUVs) conduct missions underwater without the\nneed for human intervention. A docking station (DS) can extend mission times of\nan AUV by providing a location for the AUV to recharge its batteries and\nreceive updated mission information. Various methods for locating and tracking\na DS exist, but most rely on expensive acoustic sensors, or are vision-based,\nwhich is significantly affected by water quality. In this \\doctype, we present\na vision-based method that utilizes adaptive color LED markers and dynamic\ncolor filtering to maximize landmark visibility in varying water conditions.\nBoth AUV and DS utilize cameras to determine the water background color in\norder to calculate the desired marker color. No communication between AUV and\nDS is needed to determine marker color. Experiments conducted in a pool and\nlake show our method performs 10 times better than static color thresholding\nmethods as background color varies. DS detection is possible at a range of 5\nmeters in clear water with minimal false positives.\n","authors":["Corey Knutson","Zhipeng Cao","Junaed Sattar"],"pdf_url":"https://arxiv.org/pdf/2310.02944v1.pdf","comment":"Submitted to ICRA 2024 for review"},{"id":"http://arxiv.org/abs/2202.01069v2","updated":"2023-10-04T16:14:51Z","published":"2022-02-02T15:00:44Z","title":"Image-based Navigation in Real-World Environments via Multiple Mid-level\n Representations: Fusion Models, Benchmark and Efficient Evaluation","summary":" Navigating complex indoor environments requires a deep understanding of the\nspace the robotic agent is acting into to correctly inform the navigation\nprocess of the agent towards the goal location. In recent learning-based\nnavigation approaches, the scene understanding and navigation abilities of the\nagent are achieved simultaneously by collecting the required experience in\nsimulation. Unfortunately, even if simulators represent an efficient tool to\ntrain navigation policies, the resulting models often fail when transferred\ninto the real world. One possible solution is to provide the navigation model\nwith mid-level visual representations containing important domain-invariant\nproperties of the scene. But, what are the best representations that facilitate\nthe transfer of a model to the real-world? How can they be combined? 
In this\nwork we address these issues by proposing a benchmark of Deep Learning\narchitectures to combine a range of mid-level visual representations, to\nperform a PointGoal navigation task following a Reinforcement Learning setup.\nAll the proposed navigation models have been trained with the Habitat simulator\non a synthetic office environment and have been tested on the same real-world\nenvironment using a real robotic platform. To efficiently assess their\nperformance in a real context, a validation tool has been proposed to generate\nrealistic navigation episodes inside the simulator. Our experiments showed that\nnavigation models can benefit from the multi-modal input and that our\nvalidation tool can provide good estimation of the expected navigation\nperformance in the real world, while saving time and resources. The acquired\nsynthetic and real 3D models of the environment, together with the code of our\nvalidation tool built on top of Habitat, are publicly available at the\nfollowing link: https://iplab.dmi.unict.it/EmbodiedVN/\n","authors":["Marco Rosano","Antonino Furnari","Luigi Gulino","Corrado Santoro","Giovanni Maria Farinella"],"pdf_url":"https://arxiv.org/pdf/2202.01069v2.pdf","comment":"Paper accepted for submission in Autonomous Robots"},{"id":"http://arxiv.org/abs/2310.02931v1","updated":"2023-10-04T16:09:35Z","published":"2023-10-04T16:09:35Z","title":"Graph data modelling for outcome prediction in oropharyngeal cancer\n patients","summary":" Graph neural networks (GNNs) are becoming increasingly popular in the medical\ndomain for the tasks of disease classification and outcome prediction. Since\npatient data is not readily available as a graph, most existing methods either\nmanually define a patient graph, or learn a latent graph based on pairwise\nsimilarities between the patients. There are also hypergraph neural network\n(HGNN)-based methods that were introduced recently to exploit potential higher\norder associations between the patients by representing them as a hypergraph.\nIn this work, we propose a patient hypergraph network (PHGN), which has been\ninvestigated in an inductive learning setup for binary outcome prediction in\noropharyngeal cancer (OPC) patients using computed tomography (CT)-based\nradiomic features for the first time. Additionally, the proposed model was\nextended to perform time-to-event analyses, and compared with GNN and baseline\nlinear models.\n","authors":["Nithya Bhasker","Stefan Leger","Alexander Zwanenburg","Chethan Babu Reddy","Sebastian Bodenstedt","Steffen Löck","Stefanie Speidel"],"pdf_url":"https://arxiv.org/pdf/2310.02931v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18171v2","updated":"2023-10-04T15:55:04Z","published":"2023-05-29T16:02:09Z","title":"Improved Probabilistic Image-Text Representations","summary":" Image-Text Matching (ITM) task, a fundamental vision-language (VL) task,\nsuffers from the inherent ambiguity arising from multiplicity and imperfect\nannotations. Deterministic functions are not sufficiently powerful to capture\nambiguity, prompting the exploration of probabilistic embeddings to tackle the\nchallenge. However, the existing probabilistic ITM approach encounters two key\nshortcomings; the burden of heavy computations due to the Monte Carlo\napproximation, and the loss saturation issue in the face of abundant false\nnegatives. 
To overcome the issues, this paper presents an improved\nProbabilistic Cross-Modal Embeddings (named PCME++) by introducing a new\nprobabilistic distance with a closed-form solution. In addition, two\noptimization techniques are proposed to enhance PCME++ further; first, the\nincorporation of pseudo-positives to prevent the loss saturation problem under\nmassive false negatives; second, mixed sample data augmentation for\nprobabilistic matching. Experimental results on MS-COCO Caption and two\nextended benchmarks, CxC and ECCV Caption, demonstrate the effectiveness of\nPCME++ compared to state-of-the-art ITM methods. The robustness of PCME++ is\nalso evaluated under noisy image-text correspondences. In addition, the\npotential applicability of PCME++ in automatic prompt tuning for zero-shot\nclassification is shown. The code is available at\nhttps://naver-ai.github.io/pcmepp/.\n","authors":["Sanghyuk Chun"],"pdf_url":"https://arxiv.org/pdf/2305.18171v2.pdf","comment":"Code: https://github.com/naver-ai/pcmepp. Project page:\n https://naver-ai.github.io/pcmepp/. 26 pages, 1.2 MB"},{"id":"http://arxiv.org/abs/2303.11916v2","updated":"2023-10-04T15:54:30Z","published":"2023-03-21T15:06:35Z","title":"CompoDiff: Versatile Composed Image Retrieval With Latent Diffusion","summary":" This paper proposes a novel diffusion-based model, CompoDiff, for solving\nComposed Image Retrieval (CIR) with latent diffusion and presents a newly\ncreated dataset, named SynthTriplets18M, of 18 million reference images,\nconditions, and corresponding target image triplets to train the model.\nCompoDiff and SynthTriplets18M tackle the shortages of the previous CIR\napproaches, such as poor generalizability due to the small dataset scale and\nthe limited types of conditions. CompoDiff not only achieves a new zero-shot\nstate-of-the-art on four CIR benchmarks, including FashionIQ, CIRR, CIRCO, and\nGeneCIS, but also enables a more versatile and controllable CIR by accepting\nvarious conditions, such as negative text and image mask conditions, and the\ncontrollability to the importance between multiple queries or the trade-off\nbetween inference speed and the performance which are unavailable with existing\nCIR methods. The code and dataset are available at\nhttps://github.com/navervision/CompoDiff\n","authors":["Geonmo Gu","Sanghyuk Chun","Wonjae Kim","HeeJae Jun","Yoohoon Kang","Sangdoo Yun"],"pdf_url":"https://arxiv.org/pdf/2303.11916v2.pdf","comment":"First two authors contributed equally; 26 pages, 4.1MB"},{"id":"http://arxiv.org/abs/2308.06887v2","updated":"2023-10-04T15:45:47Z","published":"2023-08-14T01:47:26Z","title":"Robustified ANNs Reveal Wormholes Between Human Category Percepts","summary":" The visual object category reports of artificial neural networks (ANNs) are\nnotoriously sensitive to tiny, adversarial image perturbations. Because human\ncategory reports (aka human percepts) are thought to be insensitive to those\nsame small-norm perturbations -- and locally stable in general -- this argues\nthat ANNs are incomplete scientific models of human visual perception.\nConsistent with this, we show that when small-norm image perturbations are\ngenerated by standard ANN models, human object category percepts are indeed\nhighly stable. However, in this very same \"human-presumed-stable\" regime, we\nfind that robustified ANNs reliably discover low-norm image perturbations that\nstrongly disrupt human percepts. 
These previously undetectable human perceptual\ndisruptions are massive in amplitude, approaching the same level of sensitivity\nseen in robustified ANNs. Further, we show that robustified ANNs support\nprecise perceptual state interventions: they guide the construction of low-norm\nimage perturbations that strongly alter human category percepts toward specific\nprescribed percepts. These observations suggest that for arbitrary starting\npoints in image space, there exists a set of nearby \"wormholes\", each leading\nthe subject from their current category perceptual state into a semantically\nvery different state. Moreover, contemporary ANN models of biological visual\nprocessing are now accurate enough to consistently guide us to those portals.\n","authors":["Guy Gaziv","Michael J. Lee","James J. DiCarlo"],"pdf_url":"https://arxiv.org/pdf/2308.06887v2.pdf","comment":"In NeurIPS 2023. Code: https://github.com/ggaziv/Wormholes Project\n Webpage: https://himjl.github.io/pwormholes"},{"id":"http://arxiv.org/abs/2310.02906v1","updated":"2023-10-04T15:43:26Z","published":"2023-10-04T15:43:26Z","title":"Boosting Dermatoscopic Lesion Segmentation via Diffusion Models with\n Visual and Textual Prompts","summary":" Image synthesis approaches, e.g., generative adversarial networks, have been\npopular as a form of data augmentation in medical image analysis tasks. It is\nprimarily beneficial to overcome the shortage of publicly accessible data and\nassociated quality annotations. However, the current techniques often lack\ncontrol over the detailed contents in generated images, e.g., the type of\ndisease patterns, the location of lesions, and attributes of the diagnosis. In\nthis work, we adapt the latest advance in the generative model, i.e., the\ndiffusion model, with the added control flow using lesion-specific visual and\ntextual prompts for generating dermatoscopic images. We further demonstrate the\nadvantage of our diffusion model-based framework over the classical generation\nmodels in both the image quality and boosting the segmentation performance on\nskin lesions. It can achieve a 9% increase in the SSIM image quality measure\nand an over 5% increase in Dice coefficients over the prior arts.\n","authors":["Shiyi Du","Xiaosong Wang","Yongyi Lu","Yuyin Zhou","Shaoting Zhang","Alan Yuille","Kang Li","Zongwei Zhou"],"pdf_url":"https://arxiv.org/pdf/2310.02906v1.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2310.02901v1","updated":"2023-10-04T15:39:57Z","published":"2023-10-04T15:39:57Z","title":"Computationally Efficient Quadratic Neural Networks","summary":" Higher order artificial neurons whose outputs are computed by applying an\nactivation function to a higher order multinomial function of the inputs have\nbeen considered in the past, but did not gain acceptance due to the extra\nparameters and computational cost. However, higher order neurons have\nsignificantly greater learning capabilities since the decision boundaries of\nhigher order neurons can be complex surfaces instead of just hyperplanes. The\nboundary of a single quadratic neuron can be a general hyper-quadric surface\nallowing it to learn many nonlinearly separable datasets. Since quadratic forms\ncan be represented by symmetric matrices, only $\\frac{n(n+1)}{2}$ additional\nparameters are needed instead of $n^2$. A quadratic Logistic regression model\nis first presented. Solutions to the XOR problem with a single quadratic neuron\nare considered. 
The complete vectorized equations for both forward and backward\npropagation in feedforward networks composed of quadratic neurons are derived.\nA reduced parameter quadratic neural network model with just $ n $ additional\nparameters per neuron that provides a compromise between learning ability and\ncomputational cost is presented. Comparison on benchmark classification\ndatasets are used to demonstrate that a final layer of quadratic neurons\nenables networks to achieve higher accuracy with significantly fewer hidden\nlayer neurons. In particular this paper shows that any dataset composed of $C$\nbounded clusters can be separated with only a single layer of $C$ quadratic\nneurons.\n","authors":["Mathew Mithra Noel","Venkataraman Muthiah-Nakarajan"],"pdf_url":"https://arxiv.org/pdf/2310.02901v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2310.02894v1","updated":"2023-10-04T15:31:02Z","published":"2023-10-04T15:31:02Z","title":"Human-centric Behavior Description in Videos: New Benchmark and Model","summary":" In the domain of video surveillance, describing the behavior of each\nindividual within the video is becoming increasingly essential, especially in\ncomplex scenarios with multiple individuals present. This is because describing\neach individual's behavior provides more detailed situational analysis,\nenabling accurate assessment and response to potential risks, ensuring the\nsafety and harmony of public places. Currently, video-level captioning datasets\ncannot provide fine-grained descriptions for each individual's specific\nbehavior. However, mere descriptions at the video-level fail to provide an\nin-depth interpretation of individual behaviors, making it challenging to\naccurately determine the specific identity of each individual. To address this\nchallenge, we construct a human-centric video surveillance captioning dataset,\nwhich provides detailed descriptions of the dynamic behaviors of 7,820\nindividuals. Specifically, we have labeled several aspects of each person, such\nas location, clothing, and interactions with other elements in the scene, and\nthese people are distributed across 1,012 videos. Based on this dataset, we can\nlink individuals to their respective behaviors, allowing for further analysis\nof each person's behavior in surveillance videos. Besides the dataset, we\npropose a novel video captioning approach that can describe individual behavior\nin detail on a person-level basis, achieving state-of-the-art results. To\nfacilitate further research in this field, we intend to release our dataset and\ncode.\n","authors":["Lingru Zhou","Yiqi Gao","Manqing Zhang","Peng Wu","Peng Wang","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.02894v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02887v1","updated":"2023-10-04T15:24:00Z","published":"2023-10-04T15:24:00Z","title":"A Grammatical Compositional Model for Video Action Detection","summary":" Analysis of human actions in videos demands understanding complex human\ndynamics, as well as the interaction between actors and context. However, these\ninteraction relationships usually exhibit large intra-class variations from\ndiverse human poses or object manipulations, and fine-grained inter-class\ndifferences between similar actions. Thus the performance of existing methods\nis severely limited. Motivated by the observation that interactive actions can\nbe decomposed into actor dynamics and participating objects or humans, we\npropose to investigate the composite property of them. 
In this paper, we\npresent a novel Grammatical Compositional Model (GCM) for action detection\nbased on typical And-Or graphs. Our model exploits the intrinsic structures and\nlatent relationships of actions in a hierarchical manner to harness both the\ncompositionality of grammar models and the capability of expressing rich\nfeatures of DNNs. The proposed model can be readily embodied into a neural\nnetwork module for efficient optimization in an end-to-end manner. Extensive\nexperiments are conducted on the AVA dataset and the Something-Else task to\ndemonstrate the superiority of our model, meanwhile the interpretability is\nenhanced through an inference parsing procedure.\n","authors":["Zhijun Zhang","Xu Zou","Jiahuan Zhou","Sheng Zhong","Ying Wu"],"pdf_url":"https://arxiv.org/pdf/2310.02887v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16364v2","updated":"2023-10-04T14:51:01Z","published":"2023-09-28T12:05:08Z","title":"FG-NeRF: Flow-GAN based Probabilistic Neural Radiance Field for\n Independence-Assumption-Free Uncertainty Estimation","summary":" Neural radiance fields with stochasticity have garnered significant interest\nby enabling the sampling of plausible radiance fields and quantifying\nuncertainty for downstream tasks. Existing works rely on the independence\nassumption of points in the radiance field or the pixels in input views to\nobtain tractable forms of the probability density function. However, this\nassumption inadvertently impacts performance when dealing with intricate\ngeometry and texture. In this work, we propose an independence-assumption-free\nprobabilistic neural radiance field based on Flow-GAN. By combining the\ngenerative capability of adversarial learning and the powerful expressivity of\nnormalizing flow, our method explicitly models the density-radiance\ndistribution of the whole scene. We represent our probabilistic NeRF as a\nmean-shifted probabilistic residual neural model. Our model is trained without\nan explicit likelihood function, thereby avoiding the independence assumption.\nSpecifically, We downsample the training images with different strides and\ncenters to form fixed-size patches which are used to train the generator with\npatch-based adversarial learning. Through extensive experiments, our method\ndemonstrates state-of-the-art performance by predicting lower rendering errors\nand more reliable uncertainty on both synthetic and real-world datasets.\n","authors":["Songlin Wei","Jiazhao Zhang","Yang Wang","Fanbo Xiang","Hao Su","He Wang"],"pdf_url":"https://arxiv.org/pdf/2309.16364v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02855v1","updated":"2023-10-04T14:42:45Z","published":"2023-10-04T14:42:45Z","title":"Multi-Resolution Fusion for Fully Automatic Cephalometric Landmark\n Detection","summary":" Cephalometric landmark detection on lateral skull X-ray images plays a\ncrucial role in the diagnosis of certain dental diseases. Accurate and\neffective identification of these landmarks presents a significant challenge.\nBased on extensive data observations and quantitative analyses, we discovered\nthat visual features from different receptive fields affect the detection\naccuracy of various landmarks differently. As a result, we employed an image\npyramid structure, integrating multiple resolutions as input to train a series\nof models with different receptive fields, aiming to achieve the optimal\nfeature combination for each landmark. 
Moreover, we applied several data\naugmentation techniques during training to enhance the model's robustness\nacross various devices and measurement alternatives. We implemented this method\nin the Cephalometric Landmark Detection in Lateral X-ray Images 2023 Challenge\nand achieved a Mean Radial Error (MRE) of 1.62 mm and a Success Detection Rate\n(SDR) 2.0mm of 74.18% in the final testing phase.\n","authors":["Dongqian Guo","Wencheng Han"],"pdf_url":"https://arxiv.org/pdf/2310.02855v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02848v1","updated":"2023-10-04T14:34:11Z","published":"2023-10-04T14:34:11Z","title":"Magicremover: Tuning-free Text-guided Image inpainting with Diffusion\n Models","summary":" Image inpainting aims to fill in the missing pixels with visually coherent\nand semantically plausible content. Despite the great progress brought from\ndeep generative models, this task still suffers from i. the difficulties in\nlarge-scale realistic data collection and costly model training; and ii. the\nintrinsic limitations in the traditionally user-defined binary masks on objects\nwith unclear boundaries or transparent texture. In this paper, we propose\nMagicRemover, a tuning-free method that leverages the powerful diffusion models\nfor text-guided image inpainting. We introduce an attention guidance strategy\nto constrain the sampling process of diffusion models, enabling the erasing of\ninstructed areas and the restoration of occluded content. We further propose a\nclassifier optimization algorithm to facilitate the denoising stability within\nless sampling steps. Extensive comparisons are conducted among our MagicRemover\nand state-of-the-art methods including quantitative evaluation and user study,\ndemonstrating the significant improvement of MagicRemover on high-quality image\ninpainting. We will release our code at https://github.com/exisas/Magicremover.\n","authors":["Siyuan Yang","Lu Zhang","Liqian Ma","Yu Liu","JingJing Fu","You He"],"pdf_url":"https://arxiv.org/pdf/2310.02848v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.10650v2","updated":"2023-10-04T14:24:09Z","published":"2023-09-19T14:30:14Z","title":"MUSTANG: Multi-Stain Self-Attention Graph Multiple Instance Learning\n Pipeline for Histopathology Whole Slide Images","summary":" Whole Slide Images (WSIs) present a challenging computer vision task due to\ntheir gigapixel size and presence of numerous artefacts. Yet they are a\nvaluable resource for patient diagnosis and stratification, often representing\nthe gold standard for diagnostic tasks. Real-world clinical datasets tend to\ncome as sets of heterogeneous WSIs with labels present at the patient-level,\nwith poor to no annotations. Weakly supervised attention-based multiple\ninstance learning approaches have been developed in recent years to address\nthese challenges, but can fail to resolve both long and short-range\ndependencies. Here we propose an end-to-end multi-stain self-attention graph\n(MUSTANG) multiple instance learning pipeline, which is designed to solve a\nweakly-supervised gigapixel multi-image classification task, where the label is\nassigned at the patient-level, but no slide-level labels or region annotations\nare available. The pipeline uses a self-attention based approach by restricting\nthe operations to a highly sparse k-Nearest Neighbour Graph of embedded WSI\npatches based on the Euclidean distance. We show this approach achieves a\nstate-of-the-art F1-score/AUC of 0.89/0.92, outperforming the widely used CLAM\nmodel. 
Our approach is highly modular and can easily be modified to suit\ndifferent clinical datasets, as it only requires a patient-level label without\nannotations and accepts WSI sets of different sizes, as the graphs can be of\nvarying sizes and structures. The source code can be found at\nhttps://github.com/AmayaGS/MUSTANG.\n","authors":["Amaya Gallagher-Syed","Luca Rossi","Felice Rivellese","Costantino Pitzalis","Myles Lewis","Michael Barnes","Gregory Slabaugh"],"pdf_url":"https://arxiv.org/pdf/2309.10650v2.pdf","comment":"Accepted for publication at BMVC 2023"},{"id":"http://arxiv.org/abs/2310.02835v1","updated":"2023-10-04T14:01:55Z","published":"2023-10-04T14:01:55Z","title":"Delving into CLIP latent space for Video Anomaly Recognition","summary":" We tackle the complex problem of detecting and recognising anomalies in\nsurveillance videos at the frame level, utilising only video-level supervision.\nWe introduce the novel method AnomalyCLIP, the first to combine Large Language\nand Vision (LLV) models, such as CLIP, with multiple instance learning for\njoint video anomaly detection and classification. Our approach specifically\ninvolves manipulating the latent CLIP feature space to identify the normal\nevent subspace, which in turn allows us to effectively learn text-driven\ndirections for abnormal events. When anomalous frames are projected onto these\ndirections, they exhibit a large feature magnitude if they belong to a\nparticular class. We also introduce a computationally efficient Transformer\narchitecture to model short- and long-term temporal dependencies between\nframes, ultimately producing the final anomaly score and class prediction\nprobabilities. We compare AnomalyCLIP against state-of-the-art methods\nconsidering three major anomaly detection benchmarks, i.e. ShanghaiTech,\nUCF-Crime, and XD-Violence, and empirically show that it outperforms baselines\nin recognising video anomalies.\n","authors":["Luca Zanella","Benedetta Liberatori","Willi Menapace","Fabio Poiesi","Yiming Wang","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2310.02835v1.pdf","comment":"submitted to Computer Vision and Image Understanding, project website\n and code are available at https://luca-zanella-dvl.github.io/AnomalyCLIP/"},{"id":"http://arxiv.org/abs/2310.02829v1","updated":"2023-10-04T13:56:32Z","published":"2023-10-04T13:56:32Z","title":"All Sizes Matter: Improving Volumetric Brain Segmentation on Small\n Lesions","summary":" Brain metastases (BMs) are the most frequently occurring brain tumors. The\ntreatment of patients having multiple BMs with stereotactic radiosurgery\nnecessitates accurate localization of the metastases. Neural networks can\nassist in this time-consuming and costly task that is typically performed by\nhuman experts. Particularly challenging is the detection of small lesions since\nthey are often underrepresented in existing approaches. Yet, lesion detection\nis equally important for all sizes. In this work, we develop an ensemble of\nneural networks explicitly focused on detecting and segmenting small BMs. To\naccomplish this task, we trained several neural networks focusing on individual\naspects of the BM segmentation problem: We use blob loss that specifically\naddresses the imbalance of lesion instances in terms of size and texture and\nis, therefore, not biased towards larger lesions. In addition, a model using a\nsubtraction sequence between the T1 and T1 contrast-enhanced sequence focuses\non low-contrast lesions.
Furthermore, we train additional models only on small\nlesions. Our experiments demonstrate the utility of the additional blob loss\nand the subtraction sequence. However, including the specialized small lesion\nmodels in the ensemble deteriorates segmentation results. We also find\ndomain-knowledge-inspired postprocessing steps to drastically increase our\nperformance in most experiments. Our approach enables us to submit a\ncompetitive challenge entry to the ASNR-MICCAI BraTS Brain Metastasis Challenge\n2023.\n","authors":["Ayhan Can Erdur","Daniel Scholz","Josef A. Buchner","Stephanie E. Combs","Daniel Rueckert","Jan C. Peeken"],"pdf_url":"https://arxiv.org/pdf/2310.02829v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02821v1","updated":"2023-10-04T13:44:56Z","published":"2023-10-04T13:44:56Z","title":"Improving Vision Anomaly Detection with the Guidance of Language\n Modality","summary":" Recent years have seen a surge of interest in anomaly detection for tackling\nindustrial defect detection, event detection, etc. However, existing\nunsupervised anomaly detectors, particularly those for the vision modality,\nface significant challenges due to redundant information and sparse latent\nspace. Conversely, the language modality performs well due to its relatively\nsingle data. This paper tackles the aforementioned challenges for vision\nmodality from a multimodal point of view. Specifically, we propose Cross-modal\nGuidance (CMG), which consists of Cross-modal Entropy Reduction (CMER) and\nCross-modal Linear Embedding (CMLE), to tackle the redundant information issue\nand sparse space issue, respectively. CMER masks parts of the raw image and\ncomputes the matching score with the text. Then, CMER discards irrelevant\npixels to make the detector focus on critical contents. To learn a more compact\nlatent space for the vision anomaly detector, CMLE learns a correlation\nstructure matrix from the language modality, and then the latent space of\nvision modality will be learned with the guidance of the matrix. Thereafter,\nthe vision latent space will get semantically similar images closer. Extensive\nexperiments demonstrate the effectiveness of the proposed methods.\nParticularly, CMG outperforms the baseline that only uses images by 16.81%.\nAblation experiments further confirm the synergy among the proposed methods, as\neach component depends on the other to achieve optimal performance.\n","authors":["Dong Chen","Kaihang Pan","Guoming Wang","Yueting Zhuang","Siliang Tang"],"pdf_url":"https://arxiv.org/pdf/2310.02821v1.pdf","comment":"9 pages, 10 figures"},{"id":"http://arxiv.org/abs/2310.02815v1","updated":"2023-10-04T13:38:53Z","published":"2023-10-04T13:38:53Z","title":"CoBEV: Elevating Roadside 3D Object Detection with Depth and Height\n Complementarity","summary":" Roadside camera-driven 3D object detection is a crucial task in intelligent\ntransportation systems, which extends the perception range beyond the\nlimitations of vision-centric vehicles and enhances road safety. While previous\nstudies have limitations in using only depth or height information, we find\nboth depth and height matter and they are in fact complementary. The depth\nfeature encompasses precise geometric cues, whereas the height feature is\nprimarily focused on distinguishing between various categories of height\nintervals, essentially providing semantic context.
This insight motivates the\ndevelopment of Complementary-BEV (CoBEV), a novel end-to-end monocular 3D\nobject detection framework that integrates depth and height to construct robust\nBEV representations. In essence, CoBEV estimates each pixel's depth and height\ndistribution and lifts the camera features into 3D space for lateral fusion\nusing the newly proposed two-stage complementary feature selection (CFS)\nmodule. A BEV feature distillation framework is also seamlessly integrated to\nfurther enhance the detection accuracy from the prior knowledge of the\nfusion-modal CoBEV teacher. We conduct extensive experiments on the public 3D\ndetection benchmarks of roadside camera-based DAIR-V2X-I and Rope3D, as well as\nthe private Supremind-Road dataset, demonstrating that CoBEV not only achieves\nthe accuracy of the new state-of-the-art, but also significantly advances the\nrobustness of previous methods in challenging long-distance scenarios and noisy\ncamera disturbance, and enhances generalization by a large margin in\nheterologous settings with drastic changes in scene and camera parameters. For\nthe first time, the vehicle AP score of a camera model reaches 80% on\nDAIR-V2X-I in terms of easy mode. The source code will be made publicly\navailable at https://github.com/MasterHow/CoBEV.\n","authors":["Hao Shi","Chengshan Pang","Jiaming Zhang","Kailun Yang","Yuhao Wu","Huajian Ni","Yining Lin","Rainer Stiefelhagen","Kaiwei Wang"],"pdf_url":"https://arxiv.org/pdf/2310.02815v1.pdf","comment":"The source code will be made publicly available at\n https://github.com/MasterHow/CoBEV"},{"id":"http://arxiv.org/abs/2211.13976v5","updated":"2023-10-04T13:37:58Z","published":"2022-11-25T09:38:22Z","title":"Expanding Small-Scale Datasets with Guided Imagination","summary":" The power of DNNs relies heavily on the quantity and quality of training\ndata. However, collecting and annotating data on a large scale is often\nexpensive and time-consuming. To address this issue, we explore a new task,\ntermed dataset expansion, aimed at expanding a ready-to-use small dataset by\nautomatically creating new labeled samples. To this end, we present a Guided\nImagination Framework (GIF) that leverages cutting-edge generative models like\nDALL-E2 and Stable Diffusion (SD) to \"imagine\" and create informative new data\nfrom the input seed data. Specifically, GIF conducts data imagination by\noptimizing the latent features of the seed data in the semantically meaningful\nspace of the prior model, resulting in the creation of photo-realistic images\nwith new content. To guide the imagination towards creating informative samples\nfor model training, we introduce two key criteria, i.e., class-maintained\ninformation boosting and sample diversity promotion. These criteria are\nverified to be essential for effective dataset expansion: GIF-SD obtains 13.5%\nhigher model accuracy on natural image datasets than unguided expansion with\nSD. With these essential criteria, GIF successfully expands small datasets in\nvarious scenarios, boosting model accuracy by 36.9% on average over six natural\nimage datasets and by 13.5% on average over three medical datasets. The source\ncode is available at https://github.com/Vanint/DatasetExpansion.\n","authors":["Yifan Zhang","Daquan Zhou","Bryan Hooi","Kai Wang","Jiashi Feng"],"pdf_url":"https://arxiv.org/pdf/2211.13976v5.pdf","comment":"NeurIPS 2023. 
Source code: https://github.com/Vanint/DatasetExpansion"},{"id":"http://arxiv.org/abs/2303.13278v2","updated":"2023-10-04T13:35:36Z","published":"2023-03-23T13:59:57Z","title":"Improved Anisotropic Gaussian Filters","summary":" Elongated anisotropic Gaussian filters are used for the orientation\nestimation of fibers. In cases where computed tomography images are noisy,\nroughly resolved, and of low contrast, they are the method of choice even if\nbeing efficient only in virtual 2D slices. However, minor inaccuracies in the\nanisotropic Gaussian filters can carry over to the orientation estimation.\nTherefore, this paper proposes a modified algorithm for 2D anisotropic Gaussian\nfilters and shows that this improves their precision. Applied to synthetic\nimages of fiber bundles, it is more accurate and robust to noise. Finally, the\neffectiveness of the approach is shown by applying it to real-world images of\nsheet molding compounds.\n","authors":["Alex Keilmann","Michael Godehardt","Ali Moghiseh","Claudia Redenbach","Katja Schladitz"],"pdf_url":"https://arxiv.org/pdf/2303.13278v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02804v1","updated":"2023-10-04T13:29:47Z","published":"2023-10-04T13:29:47Z","title":"DOMINO: A Dual-System for Multi-step Visual Language Reasoning","summary":" Visual language reasoning requires a system to extract text or numbers from\ninformation-dense images like charts or plots and perform logical or arithmetic\nreasoning to arrive at an answer. To tackle this task, existing work relies on\neither (1) an end-to-end vision-language model trained on a large amount of\ndata, or (2) a two-stage pipeline where a captioning model converts the image\ninto text that is further read by another large language model to deduce the\nanswer. However, the former approach forces the model to answer a complex\nquestion with one single step, and the latter approach is prone to inaccurate\nor distracting information in the converted text that can confuse the language\nmodel. In this work, we propose a dual-system for multi-step multimodal\nreasoning, which consists of a \"System-1\" step for visual information\nextraction and a \"System-2\" step for deliberate reasoning. Given an input,\nSystem-2 breaks down the question into atomic sub-steps, each guiding System-1\nto extract the information required for reasoning from the image. Experiments\non chart and plot datasets show that our method with a pre-trained System-2\nmodule performs competitively compared to prior work on in- and\nout-of-distribution data. By fine-tuning the System-2 module (LLaMA-2 70B) on\nonly a small amount of data on multi-step reasoning, the accuracy of our method\nis further improved and surpasses the best fully-supervised end-to-end approach\nby 5.7% and a pipeline approach with FlanPaLM (540B) by 7.5% on a challenging\ndataset with human-authored questions.\n","authors":["Peifang Wang","Olga Golovneva","Armen Aghajanyan","Xiang Ren","Muhao Chen","Asli Celikyilmaz","Maryam Fazel-Zarandi"],"pdf_url":"https://arxiv.org/pdf/2310.02804v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2112.00510v4","updated":"2023-10-04T13:26:14Z","published":"2021-12-01T14:13:11Z","title":"Trimap-guided Feature Mining and Fusion Network for Natural Image\n Matting","summary":" Utilizing trimap guidance and fusing multi-level features are two important\nissues for trimap-based matting with pixel-level prediction. 
To utilize trimap\nguidance, most existing approaches simply concatenate trimaps and images\ntogether to feed a deep network or apply an extra network to extract more\ntrimap guidance, which meets the conflict between efficiency and effectiveness.\nFor emerging content-based feature fusion, most existing matting methods only\nfocus on local features which lack the guidance of a global feature with strong\nsemantic information related to the interesting object. In this paper, we\npropose a trimap-guided feature mining and fusion network consisting of our\ntrimap-guided non-background multi-scale pooling (TMP) module and global-local\ncontext-aware fusion (GLF) modules. Considering that trimap provides strong\nsemantic guidance, our TMP module focuses effective feature mining on\ninteresting objects under the guidance of trimap without extra parameters.\nFurthermore, our GLF modules use global semantic information of interesting\nobjects mined by our TMP module to guide an effective global-local\ncontext-aware multi-level feature fusion. In addition, we build a common\ninteresting object matting (CIOM) dataset to advance high-quality image\nmatting. Particularly, results on the Composition-1k and our CIOM show that our\nTMFNet achieves 13% and 25% relative improvement on SAD, respectively, against\na strong baseline with fewer parameters and 14% fewer FLOPs. Experimental\nresults on the Composition-1k test set, Alphamatting benchmark, and our CIOM\ntest set demonstrate that our method outperforms state-of-the-art approaches.\nOur code and models are available at\nhttps://github.com/Serge-weihao/TMF-Matting.\n","authors":["Weihao Jiang","Dongdong Yu","Zhaozhi Xie","Yaoyi Li","Zehuan Yuan","Hongtao Lu"],"pdf_url":"https://arxiv.org/pdf/2112.00510v4.pdf","comment":"Accepted to Computer Vision and Image Understanding"},{"id":"http://arxiv.org/abs/2309.16264v2","updated":"2023-10-04T13:16:25Z","published":"2023-09-28T08:57:14Z","title":"GAMMA: Generalizable Articulation Modeling and Manipulation for\n Articulated Objects","summary":" Articulated objects like cabinets and doors are widespread in daily life.\nHowever, directly manipulating 3D articulated objects is challenging because\nthey have diverse geometrical shapes, semantic categories, and kinetic\nconstraints. Prior works mostly focused on recognizing and manipulating\narticulated objects with specific joint types. They can either estimate the\njoint parameters or distinguish suitable grasp poses to facilitate trajectory\nplanning. Although these approaches have succeeded in certain types of\narticulated objects, they lack generalizability to unseen objects, which\nsignificantly impedes their application in broader scenarios. In this paper, we\npropose a novel framework of Generalizable Articulation Modeling and\nManipulating for Articulated Objects (GAMMA), which learns both articulation\nmodeling and grasp pose affordance from diverse articulated objects with\ndifferent categories. In addition, GAMMA adopts adaptive manipulation to\niteratively reduce the modeling errors and enhance manipulation performance. We\ntrain GAMMA with the PartNet-Mobility dataset and evaluate with comprehensive\nexperiments in SAPIEN simulation and real-world Franka robot. Results show that\nGAMMA significantly outperforms SOTA articulation modeling and manipulation\nalgorithms in unseen and cross-category articulated objects. We will\nopen-source all codes and datasets in both simulation and real robots for\nreproduction in the final version. 
Images and videos are published on the\nproject website at: http://sites.google.com/view/gamma-articulation\n","authors":["Qiaojun Yu","Junbo Wang","Wenhai Liu","Ce Hao","Liu Liu","Lin Shao","Weiming Wang","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2309.16264v2.pdf","comment":"8 pages, 5 figures, submitted to ICRA 2024"},{"id":"http://arxiv.org/abs/2310.02792v1","updated":"2023-10-04T13:11:20Z","published":"2023-10-04T13:11:20Z","title":"Tracking Anything in Heart All at Once","summary":" Myocardial motion tracking stands as an essential clinical tool in the\nprevention and detection of Cardiovascular Diseases (CVDs), the foremost cause\nof death globally. However, current techniques suffer incomplete and inaccurate\nmotion estimation of the myocardium both in spatial and temporal dimensions,\nhindering the early identification of myocardial dysfunction. In addressing\nthese challenges, this paper introduces the Neural Cardiac Motion Field\n(NeuralCMF). NeuralCMF leverages the implicit neural representation (INR) to\nmodel the 3D structure and the comprehensive 6D forward/backward motion of the\nheart. This approach offers memory-efficient storage and continuous capability\nto query the precise shape and motion of the myocardium throughout the cardiac\ncycle at any specific point. Notably, NeuralCMF operates without the need for\npaired datasets, and its optimization is self-supervised through the physics\nknowledge priors both in space and time dimensions, ensuring compatibility with\nboth 2D and 3D echocardiogram video inputs. Experimental validations across\nthree representative datasets support the robustness and innovative nature of\nthe NeuralCMF, marking significant advantages over existing state-of-the-arts\nin cardiac imaging and motion tracking.\n","authors":["Chengkang Shen","Hao Zhu","You Zhou","Yu Liu","Si Yi","Lili Dong","Weipeng Zhao","David J. Brady","Xun Cao","Zhan Ma","Yi Lin"],"pdf_url":"https://arxiv.org/pdf/2310.02792v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2310.02781v1","updated":"2023-10-04T12:52:38Z","published":"2023-10-04T12:52:38Z","title":"LROC-PANGU-GAN: Closing the Simulation Gap in Learning Crater\n Segmentation with Planetary Simulators","summary":" It is critical for probes landing on foreign planetary bodies to be able to\nrobustly identify and avoid hazards - as, for example, steep cliffs or deep\ncraters can pose significant risks to a probe's landing and operational\nsuccess. Recent applications of deep learning to this problem show promising\nresults. These models are, however, often learned with explicit supervision\nover annotated datasets. These human-labelled crater databases, such as from\nthe Lunar Reconnaissance Orbiter Camera (LROC), may lack in consistency and\nquality, undermining model performance - as incomplete and/or inaccurate labels\nintroduce noise into the supervisory signal, which encourages the model to\nlearn incorrect associations and results in the model making unreliable\npredictions. Physics-based simulators, such as the Planet and Asteroid Natural\nScene Generation Utility, have, in contrast, perfect ground truth, as the\ninternal state that they use to render scenes is known with exactness. However,\nthey introduce a serious simulation-to-real domain gap - because of fundamental\ndifferences between the simulated environment and the real-world arising from\nmodelling assumptions, unaccounted for physical interactions, environmental\nvariability, etc. 
Therefore, models trained on their outputs suffer when\ndeployed in the face of realism they have not encountered in their training\ndata distributions. In this paper, we therefore introduce a system to close\nthis \"realism\" gap while retaining label fidelity. We train a CycleGAN model to\nsynthesise LROC from Planet and Asteroid Natural Scene Generation Utility\n(PANGU) images. We show that these synthesised images improve the training of a downstream crater\nsegmentation network, with segmentation performance on a test set of real LROC\nimages improved as compared to using only simulated PANGU images.\n","authors":["Jaewon La","Jaime Phadke","Matt Hutton","Marius Schwinning","Gabriele De Canio","Florian Renk","Lars Kunze","Matthew Gadd"],"pdf_url":"https://arxiv.org/pdf/2310.02781v1.pdf","comment":"17th Symposium on Advanced Space Technologies in Robotics and\n Automation"},{"id":"http://arxiv.org/abs/2310.02776v1","updated":"2023-10-04T12:47:48Z","published":"2023-10-04T12:47:48Z","title":"Dynamic Shuffle: An Efficient Channel Mixture Method","summary":" The redundancy of convolutional neural networks depends not only on the\nweights but also on the inputs. Shuffling is an efficient operation for mixing\nchannel information, but the shuffle order is usually pre-defined. To reduce the\ndata-dependent redundancy, we devise a dynamic shuffle module to generate\ndata-dependent permutation matrices for shuffling. Since the dimension of the\npermutation matrix is proportional to the square of the number of input\nchannels, to make the generation process efficient we divide the channels\ninto groups, generate two shared small permutation matrices for each group,\nand utilize the Kronecker product and cross-group shuffle to obtain the final\npermutation matrices. To make the generation process learnable, based on\ntheoretical analysis, softmax, orthogonal regularization, and binarization are\nemployed to asymptotically approximate the permutation matrix. Dynamic shuffle\nadaptively mixes channel information with negligible extra computation and\nmemory occupancy. Experimental results on the image classification benchmark datasets\nCIFAR-10, CIFAR-100, Tiny ImageNet, and ImageNet show that our method\nsignificantly increases ShuffleNets' performance. By adding the dynamically\ngenerated matrix to a learnable static matrix, we further propose static-dynamic-shuffle\nand show that it can serve as a lightweight replacement for ordinary pointwise\nconvolution.\n","authors":["Kaijun Gong","Zhuowen Yin","Yushu Li","Kailing Guo","Xiangmin Xu"],"pdf_url":"https://arxiv.org/pdf/2310.02776v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15842v2","updated":"2023-10-04T12:37:00Z","published":"2023-05-25T08:32:41Z","title":"Text-to-Motion Retrieval: Towards Joint Understanding of Human Motion\n Data and Natural Language","summary":" Due to recent advances in pose-estimation methods, human motion can be\nextracted from a common video in the form of 3D skeleton sequences. Despite\nwonderful application opportunities, effective and efficient content-based\naccess to large volumes of such spatio-temporal skeleton data still remains a\nchallenging problem. In this paper, we propose a novel content-based\ntext-to-motion retrieval task, which aims at retrieving relevant motions based\non a specified natural-language textual description. To define baselines for\nthis uncharted task, we employ the BERT and CLIP language representations to\nencode the text modality and successful spatio-temporal models to encode the\nmotion modality. 
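As a rough illustration of the dual-encoder retrieval setup just described (a text encoder and a motion encoder trained with a metric-learning objective), here is a minimal sketch. The toy encoders, dimensions, and the symmetric InfoNCE-style loss are illustrative assumptions, not the authors' implementation; in practice the text branch would be a pretrained BERT or CLIP text encoder and the motion branch a spatio-temporal model.

# Hedged sketch of a generic text-to-motion dual-encoder retrieval baseline.
import torch
import torch.nn as nn
import torch.nn.functional as F

class ToyTextEncoder(nn.Module):
    def __init__(self, vocab_size=30522, dim=256):
        super().__init__()
        self.emb = nn.EmbeddingBag(vocab_size, dim)   # bag-of-tokens stand-in for BERT/CLIP
        self.proj = nn.Linear(dim, dim)

    def forward(self, token_ids):                     # (batch, seq_len)
        return F.normalize(self.proj(self.emb(token_ids)), dim=-1)

class ToyMotionEncoder(nn.Module):
    def __init__(self, joint_dim=22 * 3, dim=256):
        super().__init__()
        self.gru = nn.GRU(joint_dim, dim, batch_first=True)  # stand-in for a spatio-temporal model
        self.proj = nn.Linear(dim, dim)

    def forward(self, motion):                        # (batch, frames, joint_dim)
        _, h = self.gru(motion)
        return F.normalize(self.proj(h[-1]), dim=-1)

def symmetric_info_nce(text_z, motion_z, temperature=0.07):
    logits = text_z @ motion_z.t() / temperature      # cosine similarities (embeddings are unit-norm)
    targets = torch.arange(text_z.size(0))
    return 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.t(), targets))

# toy usage with random data
text = torch.randint(0, 30522, (4, 16))
motion = torch.randn(4, 60, 66)
loss = symmetric_info_nce(ToyTextEncoder()(text), ToyMotionEncoder()(motion))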
We additionally introduce our transformer-based approach,\ncalled Motion Transformer (MoT), which employs divided space-time attention to\neffectively aggregate the different skeleton joints in space and time. Inspired\nby the recent progress in text-to-image/video matching, we experiment with two\nwidely-adopted metric-learning loss functions. Finally, we set up a common\nevaluation protocol by defining qualitative metrics for assessing the quality\nof the retrieved motions, targeting the two recently-introduced KIT\nMotion-Language and HumanML3D datasets. The code for reproducing our results is\navailable at https://github.com/mesnico/text-to-motion-retrieval.\n","authors":["Nicola Messina","Jan Sedmidubsky","Fabrizio Falchi","Tomáš Rebok"],"pdf_url":"https://arxiv.org/pdf/2305.15842v2.pdf","comment":"SIGIR 2023 (best short paper honorable mention)"},{"id":"http://arxiv.org/abs/2309.13438v2","updated":"2023-10-04T12:13:53Z","published":"2023-09-23T17:29:38Z","title":"Rethinking superpixel segmentation from biologically inspired mechanisms","summary":" Recently, advancements in deep learning-based superpixel segmentation methods\nhave brought about improvements in both the efficiency and the performance of\nsegmentation. However, a significant challenge remains in generating\nsuperpixels that strictly adhere to object boundaries while conveying rich\nvisual significance, especially when cross-surface color correlations may\ninterfere with objects. Drawing inspiration from neural structure and visual\nmechanisms, we propose a biological network architecture comprising an Enhanced\nScreening Module (ESM) and a novel Boundary-Aware Label (BAL) for superpixel\nsegmentation. The ESM enhances semantic information by simulating the\ninteractive projection mechanisms of the visual cortex. Additionally, the BAL\nemulates the spatial frequency characteristics of visual cortical cells to\nfacilitate the generation of superpixels with strong boundary adherence. We\ndemonstrate the effectiveness of our approach through evaluations on both the\nBSDS500 dataset and the NYUv2 dataset.\n","authors":["TingYu Zhao","Bo Peng","Yuan Sun","DaiPeng Yang","ZhenGuang Zhange","Xi Wu"],"pdf_url":"https://arxiv.org/pdf/2309.13438v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.15521v2","updated":"2023-10-04T11:54:08Z","published":"2023-09-27T09:39:45Z","title":"MLOps for Scarce Image Data: A Use Case in Microscopic Image Analysis","summary":" Nowadays, Machine Learning (ML) is experiencing tremendous popularity that\nhas never been seen before. The operationalization of ML models is governed by\na set of concepts and methods referred to as Machine Learning Operations\n(MLOps). Nevertheless, researchers, as well as professionals, often focus more\non the automation aspect and neglect the continuous deployment and monitoring\naspects of MLOps. As a result, there is a lack of continuous learning through\nthe flow of feedback from production to development, causing unexpected model\ndeterioration over time due to concept drifts, particularly when dealing with\nscarce data. This work explores the complete application of MLOps in the\ncontext of scarce data analysis. The paper proposes a new holistic approach to\nenhance biomedical image analysis. 
Our method includes: a fingerprinting\nprocess that enables selecting the best models, datasets, and model development\nstrategy relative to the image analysis task at hand; an automated model\ndevelopment stage; and a continuous deployment and monitoring process to ensure\ncontinuous learning. For preliminary results, we perform a proof of concept for\nfingerprinting in microscopic image datasets.\n","authors":["Angelo Yamachui Sitcheu","Nils Friederich","Simon Baeuerle","Oliver Neumann","Markus Reischl","Ralf Mikut"],"pdf_url":"https://arxiv.org/pdf/2309.15521v2.pdf","comment":"21 pages, 5 figures, 33. Workshop on Computational Intelligence\n Berlin Germany"},{"id":"http://arxiv.org/abs/2310.02753v1","updated":"2023-10-04T11:44:20Z","published":"2023-10-04T11:44:20Z","title":"MUNCH: Modelling Unique 'N Controllable Heads","summary":" The automated generation of 3D human heads has been an intriguing and\nchallenging task for computer vision researchers. Prevailing methods synthesize\nrealistic avatars but offer limited control over the diversity and quality of\nrendered outputs and suffer from limited correlation between the shape and texture\nof the character. We propose a method that offers quality, diversity, control,\nand realism along with an explainable network design, all features desirable to\ngame-design artists in the domain. First, our proposed Geometry Generator\nidentifies disentangled latent directions and generates novel and diverse\nsamples. A Render Map Generator then learns to synthesize multiple high-fidelity\nphysically-based render maps, including Albedo, Glossiness, Specular, and\nNormals. For artists preferring fine-grained control over the output, we\nintroduce a novel Color Transformer Model that allows semantic color control\nover the generated maps. We also introduce quantifiable metrics called Uniqueness\nand Novelty, and a combined metric to test the overall performance of our model.\nA demo for both shapes and textures can be found at\nhttps://munch-seven.vercel.app/. We will release our model along with the\nsynthetic dataset.\n","authors":["Debayan Deb","Suvidha Tripathi","Pranit Puri"],"pdf_url":"https://arxiv.org/pdf/2310.02753v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02751v1","updated":"2023-10-04T11:43:08Z","published":"2023-10-04T11:43:08Z","title":"SHOT: Suppressing the Hessian along the Optimization Trajectory for\n Gradient-Based Meta-Learning","summary":" In this paper, we hypothesize that gradient-based meta-learning (GBML)\nimplicitly suppresses the Hessian along the optimization trajectory in the\ninner loop. Based on this hypothesis, we introduce an algorithm called SHOT\n(Suppressing the Hessian along the Optimization Trajectory) that minimizes the\ndistance between the parameters of the target and reference models to suppress\nthe Hessian in the inner loop. Despite dealing with high-order terms, SHOT does\nnot increase the computational complexity of the baseline model much. It is\nagnostic to both the algorithm and architecture used in GBML, making it highly\nversatile and applicable to any GBML baseline. To validate the effectiveness of\nSHOT, we conduct empirical tests on standard few-shot learning tasks and\nqualitatively analyze its dynamics. We confirm our hypothesis empirically and\ndemonstrate that SHOT outperforms the corresponding baseline. 
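A minimal sketch of how a parameter-distance penalty of this kind could be added to a generic first-order inner loop follows; it is one reading of the idea, with the quadratic penalty, loop structure, and reference parameters as assumptions rather than the authors' released code.

# Hedged sketch: inner-loop adaptation with a penalty pulling the adapted
# parameters toward a reference model, in the spirit of the abstract above.
import torch

def inner_loop_with_distance_penalty(init_params, ref_params, task_loss_fn,
                                     steps=5, lr=0.01, lam=0.1):
    # first-order adaptation: each step minimizes task loss + distance penalty
    params = [p.detach().clone().requires_grad_(True) for p in init_params]
    for _ in range(steps):
        loss = task_loss_fn(params)
        penalty = sum(((p - r) ** 2).sum() for p, r in zip(params, ref_params))
        grads = torch.autograd.grad(loss + lam * penalty, params)
        params = [(p - lr * g).detach().requires_grad_(True)
                  for p, g in zip(params, grads)]
    return params

# toy usage: adapt a single parameter vector on a quadratic task loss
init = [torch.zeros(3)]
ref = [torch.ones(3)]
adapted = inner_loop_with_distance_penalty(init, ref, lambda ps: (ps[0] ** 2).sum())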
Code is available\nat: https://github.com/JunHoo-Lee/SHOT\n","authors":["JunHoo Lee","Jayeon Yoo","Nojun Kwak"],"pdf_url":"https://arxiv.org/pdf/2310.02751v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.12114v2","updated":"2023-10-04T10:50:42Z","published":"2023-09-21T14:34:17Z","title":"AutoPET Challenge 2023: Sliding Window-based Optimization of U-Net","summary":" Tumor segmentation in medical imaging is crucial and relies on precise\ndelineation. Fluorodeoxyglucose Positron-Emission Tomography (FDG-PET) is\nwidely used in clinical practice to detect metabolically active tumors.\nHowever, FDG-PET scans may misinterpret irregular glucose consumption in\nhealthy or benign tissues as cancer. Combining PET with Computed Tomography\n(CT) can enhance tumor segmentation by integrating metabolic and anatomic\ninformation. FDG-PET/CT scans are pivotal for cancer staging and reassessment,\nutilizing radiolabeled fluorodeoxyglucose to highlight metabolically active\nregions. Accurately distinguishing tumor-specific uptake from physiological\nuptake in normal tissues is a challenging aspect of precise tumor segmentation.\nThe AutoPET challenge addresses this by providing a dataset of 1014 FDG-PET/CT\nstudies, encouraging advancements in accurate tumor segmentation and analysis\nwithin the FDG-PET/CT domain. Code:\nhttps://github.com/matt3o/AutoPET2-Submission/\n","authors":["Matthias Hadlich","Zdravko Marinov","Rainer Stiefelhagen"],"pdf_url":"https://arxiv.org/pdf/2309.12114v2.pdf","comment":"9 pages, 1 figure, MICCAI 2023 - AutoPET Challenge Submission Version\n 2: Added all results on the preliminary test set"},{"id":"http://arxiv.org/abs/2310.02719v1","updated":"2023-10-04T10:45:55Z","published":"2023-10-04T10:45:55Z","title":"Condition numbers in multiview geometry, instability in relative pose\n estimation, and RANSAC","summary":" In this paper we introduce a general framework for analyzing the numerical\nconditioning of minimal problems in multiple view geometry, using tools from\ncomputational algebra and Riemannian geometry. Special motivation comes from\nthe fact that relative pose estimation, based on standard 5-point or 7-point\nRandom Sample Consensus (RANSAC) algorithms, can fail even when no outliers are\npresent and there is enough data to support a hypothesis. We argue that these\ncases arise due to the intrinsic instability of the 5- and 7-point minimal\nproblems. We apply our framework to characterize the instabilities, both in\nterms of the world scenes that lead to infinite condition number, and directly\nin terms of ill-conditioned image data. The approach produces computational\ntests for assessing the condition number before solving the minimal problem.\nLastly synthetic and real data experiments suggest that RANSAC serves not only\nto remove outliers, but also to select for well-conditioned image data, as\npredicted by our theory.\n","authors":["Hongyi Fan","Joe Kileel","Benjamin Kimia"],"pdf_url":"https://arxiv.org/pdf/2310.02719v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02718v1","updated":"2023-10-04T10:41:21Z","published":"2023-10-04T10:41:21Z","title":"Understanding Pan-Sharpening via Generalized Inverse","summary":" Pan-sharpening algorithm utilizes panchromatic image and multispectral image\nto obtain a high spatial and high spectral image. However, the optimizations of\nthe algorithms are designed with different standards. We adopt the simple\nmatrix equation to describe the Pan-sharpening problem. 
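For intuition about the matrix-equation view mentioned above, the spatial down-sampling and spectral-response operators can be stacked into a single linear system b = A x and inverted with a Moore-Penrose pseudoinverse. The toy sketch below assumes block-averaging down-sampling and uniform spectral weights; these are illustrative choices, not the paper's operators or its specific derivation.

# Hedged toy illustration of the generalized-inverse view of Pan-sharpening.
import numpy as np

B, n_hi, n_lo = 4, 8, 4                               # bands, high-res and low-res width
N = n_hi * n_hi
D1 = np.kron(np.eye(n_lo), np.ones((1, 2)) / 2)       # 1D 2x averaging (assumed)
D = np.kron(D1, D1)                                   # 2D spatial down-sampling, 16 x 64
s = np.full(B, 1.0 / B)                               # assumed spectral response of the pan band

A = np.vstack([np.kron(np.eye(B), D),                 # low-res multispectral observation
               np.kron(s.reshape(1, -1), np.eye(N))]) # panchromatic observation

x_true = np.random.rand(B * N)                        # flattened high-res MS image (bands stacked)
b = A @ x_true                                        # simulated observations [ms; pan]
x_hat = np.linalg.pinv(A) @ b                         # minimum-norm generalized-inverse solution
print(np.linalg.norm(A @ x_hat - b))                  # fits the observations (near zero)

Because the stacked system is underdetermined, the pseudoinverse returns the minimum-norm solution consistent with the observations, which is exactly where the choice of generalized inverse (and hence of Pan-sharpening method) matters.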
The solution existence\ncondition and the acquisition of spectral and spatial resolution are discussed.\nA down-sampling enhancement method is introduced to better acquire the\nspatial and spectral down-sampling matrices. Using generalized inverse theory,\nwe derive two forms of generalized inverse matrix formulations that correspond\nto the two prominent classes of Pan-sharpening methods, that is, component\nsubstitution and multi-resolution analysis methods. Specifically, the Gram-Schmidt\nAdaptive (GSA) method is proved to follow the generalized inverse matrix\nformulation of component substitution. A model prior on the generalized inverse\nmatrix of the spectral function is also presented. The theoretical errors are\nanalyzed, and both synthetic and real-data experiments are conducted. The\nproposed methods produce qualitatively better and sharper results than other methods in\nboth synthetic and real experiments. The down-sampling enhancement is\nshown to yield better results both quantitatively and qualitatively in real\nexperiments. Generalized inverse matrix theory thus helps us better understand\nPan-sharpening.\n","authors":["Shiqi Liu","Yutong Bai","Xinyang Han","Alan Yuille"],"pdf_url":"https://arxiv.org/pdf/2310.02718v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.03531v2","updated":"2023-10-04T10:39:47Z","published":"2023-06-06T09:28:37Z","title":"A Unified Concept-Based System for Local, Global, and Misclassification\n Explanations","summary":" Explainability of Deep Neural Networks (DNNs) has been garnering increasing\nattention in recent years. Of the various explainability approaches,\nconcept-based techniques stand out for their ability to utilize\nhuman-meaningful concepts instead of focusing solely on individual pixels.\nHowever, there is a scarcity of methods that consistently provide both local\nand global explanations. Moreover, most of these methods do not offer explanations for\nmisclassification cases. Considering these challenges, we present a unified\nconcept-based system for unsupervised learning of both local and global\nconcepts. Our primary objective is to uncover the intrinsic concepts underlying\neach data category by training surrogate explainer networks to estimate the\nimportance of the concepts. Our experimental results substantiate the efficacy\nof the discovered concepts through diverse quantitative and qualitative\nassessments, encompassing faithfulness, completeness, and generality.\nFurthermore, our approach facilitates the explanation of both accurate and\nerroneous predictions, rendering it a valuable tool for comprehending the\ncharacteristics of the target objects and classes.\n","authors":["Fatemeh Aghaeipoor","Dorsa Asgarian","Mohammad Sabokrou"],"pdf_url":"https://arxiv.org/pdf/2306.03531v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.04995v3","updated":"2023-10-04T10:39:24Z","published":"2023-03-09T02:38:32Z","title":"Text-Visual Prompting for Efficient 2D Temporal Video Grounding","summary":" In this paper, we study the problem of temporal video grounding (TVG), which\naims to predict the starting/ending time points of moments described by a text\nsentence within a long untrimmed video. Benefiting from fine-grained 3D visual\nfeatures, the TVG techniques have achieved remarkable progress in recent years.\nHowever, the high complexity of 3D convolutional neural networks (CNNs) makes\nextracting dense 3D visual features time-consuming, which calls for intensive\nmemory and computing resources. 
Towards efficient TVG, we propose a novel\ntext-visual prompting (TVP) framework, which incorporates optimized\nperturbation patterns (that we call 'prompts') into both visual inputs and\ntextual features of a TVG model. In sharp contrast to 3D CNNs, we show that TVP\nallows us to effectively co-train vision encoder and language encoder in a 2D\nTVG model and improves the performance of crossmodal feature fusion using only\nlow-complexity sparse 2D visual features. Further, we propose a\nTemporal-Distance IoU (TDIoU) loss for efficient learning of TVG. Experiments\non two benchmark datasets, Charades-STA and ActivityNet Captions datasets,\nempirically show that the proposed TVP significantly boosts the performance of\n2D TVG (e.g., 9.79% improvement on Charades-STA and 30.77% improvement on\nActivityNet Captions) and achieves 5x inference acceleration over TVG using 3D\nvisual features. Codes are available at Open.Intel.\n","authors":["Yimeng Zhang","Xin Chen","Jinghan Jia","Sijia Liu","Ke Ding"],"pdf_url":"https://arxiv.org/pdf/2303.04995v3.pdf","comment":"Accepted to the CVPR 2023 and code released\n (https://github.com/intel/TVP)"},{"id":"http://arxiv.org/abs/2310.02714v1","updated":"2023-10-04T10:30:24Z","published":"2023-10-04T10:30:24Z","title":"GETAvatar: Generative Textured Meshes for Animatable Human Avatars","summary":" We study the problem of 3D-aware full-body human generation, aiming at\ncreating animatable human avatars with high-quality textures and geometries.\nGenerally, two challenges remain in this field: i) existing methods struggle to\ngenerate geometries with rich realistic details such as the wrinkles of\ngarments; ii) they typically utilize volumetric radiance fields and neural\nrenderers in the synthesis process, making high-resolution rendering\nnon-trivial. To overcome these problems, we propose GETAvatar, a Generative\nmodel that directly generates Explicit Textured 3D meshes for animatable human\nAvatar, with photo-realistic appearance and fine geometric details.\nSpecifically, we first design an articulated 3D human representation with\nexplicit surface modeling, and enrich the generated humans with realistic\nsurface details by learning from the 2D normal maps of 3D scan data. Second,\nwith the explicit mesh representation, we can use a rasterization-based\nrenderer to perform surface rendering, allowing us to achieve high-resolution\nimage generation efficiently. Extensive experiments demonstrate that GETAvatar\nachieves state-of-the-art performance on 3D-aware human generation both in\nappearance and geometry quality. Notably, GETAvatar can generate images at\n512x512 resolution with 17FPS and 1024x1024 resolution with 14FPS, improving\nupon previous methods by 2x. Our code and models will be available.\n","authors":["Xuanmeng Zhang","Jianfeng Zhang","Rohan Chacko","Hongyi Xu","Guoxian Song","Yi Yang","Jiashi Feng"],"pdf_url":"https://arxiv.org/pdf/2310.02714v1.pdf","comment":"Accepted by ICCV2023. Project Page: https://getavatar.github.io/"},{"id":"http://arxiv.org/abs/2310.02712v1","updated":"2023-10-04T10:28:38Z","published":"2023-10-04T10:28:38Z","title":"ED-NeRF: Efficient Text-Guided Editing of 3D Scene using Latent Space\n NeRF","summary":" Recently, there has been a significant advancement in text-to-image diffusion\nmodels, leading to groundbreaking performance in 2D image generation. These\nadvancements have been extended to 3D models, enabling the generation of novel\n3D objects from textual descriptions. 
This has evolved into NeRF editing\nmethods, which allow the manipulation of existing 3D objects through textual\nconditioning. However, existing NeRF editing techniques have faced limitations\nin their performance due to slow training speeds and the use of loss functions\nthat do not adequately consider editing. To address this, here we present a\nnovel 3D NeRF editing approach dubbed ED-NeRF by successfully embedding\nreal-world scenes into the latent space of the latent diffusion model (LDM)\nthrough a unique refinement layer. This approach enables us to obtain a NeRF\nbackbone that is not only faster but also more amenable to editing compared to\ntraditional image space NeRF editing. Furthermore, we propose an improved loss\nfunction tailored for editing by migrating the delta denoising score (DDS)\ndistillation loss, originally used in 2D image editing to the three-dimensional\ndomain. This novel loss function surpasses the well-known score distillation\nsampling (SDS) loss in terms of suitability for editing purposes. Our\nexperimental results demonstrate that ED-NeRF achieves faster editing speed\nwhile producing improved output quality compared to state-of-the-art 3D editing\nmodels.\n","authors":["Jangho Park","Gihyun Kwon","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2310.02712v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02692v1","updated":"2023-10-04T10:03:07Z","published":"2023-10-04T10:03:07Z","title":"Bridging the Domain Gap by Clustering-based Image-Text Graph Matching","summary":" Learning domain-invariant representations is important to train a model that\ncan generalize well to unseen target task domains. Text descriptions inherently\ncontain semantic structures of concepts and such auxiliary semantic cues can be\nused as effective pivot embedding for domain generalization problems. Here, we\nuse multimodal graph representations, fusing images and text, to get\ndomain-invariant pivot embeddings by considering the inherent semantic\nstructure between local images and text descriptors. Specifically, we aim to\nlearn domain-invariant features by (i) representing the image and text\ndescriptions with graphs, and by (ii) clustering and matching the graph-based\nimage node features into textual graphs simultaneously. We experiment with\nlarge-scale public datasets, such as CUB-DG and DomainBed, and our model\nachieves matched or better state-of-the-art performance on these datasets. Our\ncode will be publicly available upon publication.\n","authors":["Nokyung Park","Daewon Chae","Jeongyong Shim","Sangpil Kim","Eun-Sol Kim","Jinkyu Kim"],"pdf_url":"https://arxiv.org/pdf/2310.02692v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.15410v2","updated":"2023-10-04T10:02:45Z","published":"2023-06-27T12:11:22Z","title":"AutoGraph: Predicting Lane Graphs from Traffic Observations","summary":" Lane graph estimation is a long-standing problem in the context of autonomous\ndriving. Previous works aimed at solving this problem by relying on\nlarge-scale, hand-annotated lane graphs, introducing a data bottleneck for\ntraining models to solve this task. To overcome this limitation, we propose to\nuse the motion patterns of traffic participants as lane graph annotations. In\nour AutoGraph approach, we employ a pre-trained object tracker to collect the\ntracklets of traffic participants such as vehicles and trucks. 
Based on the\nlocation of these tracklets, we predict the successor lane graph from an\ninitial position using overhead RGB images only, not requiring any human\nsupervision. In a subsequent stage, we show how the individual successor\npredictions can be aggregated into a consistent lane graph. We demonstrate the\nefficacy of our approach on the UrbanLaneGraph dataset and perform extensive\nquantitative and qualitative evaluations, indicating that AutoGraph is on par\nwith models trained on hand-annotated graph data. Model and dataset will be\nmade available at redacted-for-review.\n","authors":["Jannik Zürn","Ingmar Posner","Wolfram Burgard"],"pdf_url":"https://arxiv.org/pdf/2306.15410v2.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2310.02690v1","updated":"2023-10-04T10:02:04Z","published":"2023-10-04T10:02:04Z","title":"Multi-Dimension-Embedding-Aware Modality Fusion Transformer for\n Psychiatric Disorder Clasification","summary":" Deep learning approaches, together with neuroimaging techniques, play an\nimportant role in psychiatric disorders classification. Previous studies on\npsychiatric disorders diagnosis mainly focus on using functional connectivity\nmatrices of resting-state functional magnetic resonance imaging (rs-fMRI) as\ninput, which still needs to fully utilize the rich temporal information of the\ntime series of rs-fMRI data. In this work, we proposed a\nmulti-dimension-embedding-aware modality fusion transformer (MFFormer) for\nschizophrenia and bipolar disorder classification using rs-fMRI and T1 weighted\nstructural MRI (T1w sMRI). Concretely, to fully utilize the temporal\ninformation of rs-fMRI and spatial information of sMRI, we constructed a deep\nlearning architecture that takes as input 2D time series of rs-fMRI and 3D\nvolumes T1w. Furthermore, to promote intra-modality attention and information\nfusion across different modalities, a fusion transformer module (FTM) is\ndesigned through extensive self-attention of hybrid feature maps of\nmulti-modality. In addition, a dimension-up and dimension-down strategy is\nsuggested to properly align feature maps of multi-dimensional from different\nmodalities. Experimental results on our private and public OpenfMRI datasets\nshow that our proposed MFFormer performs better than that using a single\nmodality or multi-modality MRI on schizophrenia and bipolar disorder diagnosis.\n","authors":["Guoxin Wang","Xuyang Cao","Shan An","Fengmei Fan","Chao Zhang","Jinsong Wang","Feng Yu","Zhiren Wang"],"pdf_url":"https://arxiv.org/pdf/2310.02690v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02687v1","updated":"2023-10-04T09:51:58Z","published":"2023-10-04T09:51:58Z","title":"USB-NeRF: Unrolling Shutter Bundle Adjusted Neural Radiance Fields","summary":" Neural Radiance Fields (NeRF) has received much attention recently due to its\nimpressive capability to represent 3D scene and synthesize novel view images.\nExisting works usually assume that the input images are captured by a global\nshutter camera. Thus, rolling shutter (RS) images cannot be trivially applied\nto an off-the-shelf NeRF algorithm for novel view synthesis. Rolling shutter\neffect would also affect the accuracy of the camera pose estimation (e.g. via\nCOLMAP), which further prevents the success of NeRF algorithm with RS images.\nIn this paper, we propose Unrolling Shutter Bundle Adjusted Neural Radiance\nFields (USB-NeRF). 
USB-NeRF is able to correct rolling shutter distortions and\nrecover accurate camera motion trajectory simultaneously under the framework of\nNeRF, by modeling the physical image formation process of a RS camera.\nExperimental results demonstrate that USB-NeRF achieves better performance\ncompared to prior works, in terms of RS effect removal, novel view image\nsynthesis as well as camera motion estimation. Furthermore, our algorithm can\nalso be used to recover high-fidelity high frame-rate global shutter video from\na sequence of RS images.\n","authors":["Moyang Li","Peng Wang","Lingzhe Zhao","Bangyan Liao","Peidong Liu"],"pdf_url":"https://arxiv.org/pdf/2310.02687v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05613v2","updated":"2023-10-04T09:42:43Z","published":"2023-09-11T16:54:34Z","title":"Learning the Geodesic Embedding with Graph Neural Networks","summary":" We present GeGnn, a learning-based method for computing the approximate\ngeodesic distance between two arbitrary points on discrete polyhedra surfaces\nwith constant time complexity after fast precomputation. Previous relevant\nmethods either focus on computing the geodesic distance between a single source\nand all destinations, which has linear complexity at least or require a long\nprecomputation time. Our key idea is to train a graph neural network to embed\nan input mesh into a high-dimensional embedding space and compute the geodesic\ndistance between a pair of points using the corresponding embedding vectors and\na lightweight decoding function. To facilitate the learning of the embedding,\nwe propose novel graph convolution and graph pooling modules that incorporate\nlocal geodesic information and are verified to be much more effective than\nprevious designs. After training, our method requires only one forward pass of\nthe network per mesh as precomputation. Then, we can compute the geodesic\ndistance between a pair of points using our decoding function, which requires\nonly several matrix multiplications and can be massively parallelized on GPUs.\nWe verify the efficiency and effectiveness of our method on ShapeNet and\ndemonstrate that our method is faster than existing methods by orders of\nmagnitude while achieving comparable or better accuracy. Additionally, our\nmethod exhibits robustness on noisy and incomplete meshes and strong\ngeneralization ability on out-of-distribution meshes. The code and pretrained\nmodel can be found on https://github.com/IntelligentGeometry/GeGnn.\n","authors":["Bo Pang","Zhongtian Zheng","Guoping Wang","Peng-Shuai Wang"],"pdf_url":"https://arxiv.org/pdf/2309.05613v2.pdf","comment":"SIGGRAPH Asia 2023, Journal Track"},{"id":"http://arxiv.org/abs/2211.14118v2","updated":"2023-10-04T09:29:07Z","published":"2022-11-25T14:01:54Z","title":"MS-PS: A Multi-Scale Network for Photometric Stereo With a New\n Comprehensive Training Dataset","summary":" The photometric stereo (PS) problem consists in reconstructing the 3D-surface\nof an object, thanks to a set of photographs taken under different lighting\ndirections. In this paper, we propose a multi-scale architecture for PS which,\ncombined with a new dataset, yields state-of-the-art results. Our proposed\narchitecture is flexible: it permits to consider a variable number of images as\nwell as variable image size without loss of performance. In addition, we define\na set of constraints to allow the generation of a relevant synthetic dataset to\ntrain convolutional neural networks for the PS problem. 
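For background on the PS problem itself: the classical calibrated Lambertian baseline, which learning-based methods such as the one above are compared against, reduces to a per-pixel least-squares problem I = L n. The following minimal sketch uses toy lights and a single shadow-free pixel as assumptions; it is the textbook baseline, not the proposed multi-scale network.

# Hedged sketch of classical calibrated Lambertian photometric stereo for one pixel.
import numpy as np

K = 6
L = np.random.randn(K, 3)
L[:, 2] = np.abs(L[:, 2]) + 2.0            # assumed lights roughly from above (no attached shadows)
L /= np.linalg.norm(L, axis=1, keepdims=True)
n_true = np.array([0.2, -0.3, 0.93])
n_true /= np.linalg.norm(n_true)
albedo = 0.8
I = albedo * (L @ n_true)                  # Lambertian intensities under the K lights

g, *_ = np.linalg.lstsq(L, I, rcond=None)  # least-squares scaled normal g = albedo * n
normal_hat = g / np.linalg.norm(g)
albedo_hat = np.linalg.norm(g)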
Our proposed dataset is\nmuch larger than pre-existing ones, and contains many objects with challenging\nmaterials having anisotropic reflectance (e.g. metals, glass). We show on\npublicly available benchmarks that the combination of both these contributions\ndrastically improves the accuracy of the estimated normal field, in comparison\nwith previous state-of-the-art methods.\n","authors":["Clément Hardy","Yvain Quéau","David Tschumperlé"],"pdf_url":"https://arxiv.org/pdf/2211.14118v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02676v1","updated":"2023-10-04T09:27:39Z","published":"2023-10-04T09:27:39Z","title":"PostRainBench: A comprehensive benchmark and a new model for\n precipitation forecasting","summary":" Accurate precipitation forecasting is a vital challenge of both scientific\nand societal importance. Data-driven approaches have emerged as a widely used\nsolution for addressing this challenge. However, solely relying on data-driven\napproaches has limitations in modeling the underlying physics, making accurate\npredictions difficult. Coupling AI-based post-processing techniques with\ntraditional Numerical Weather Prediction (NWP) methods offers a more effective\nsolution for improving forecasting accuracy. Despite previous post-processing\nefforts, accurately predicting heavy rainfall remains challenging due to the\nimbalanced precipitation data across locations and complex relationships\nbetween multiple meteorological variables. To address these limitations, we\nintroduce the PostRainBench, a comprehensive multi-variable NWP post-processing\nbenchmark consisting of three datasets for NWP post-processing-based\nprecipitation forecasting. We propose CAMT, a simple yet effective Channel\nAttention Enhanced Multi-task Learning framework with a specially designed\nweighted loss function. Its flexible design allows for easy plug-and-play\nintegration with various backbones. Extensive experimental results on the\nproposed benchmark show that our method outperforms state-of-the-art methods by\n6.3%, 4.7%, and 26.8% in rain CSI on the three datasets respectively. Most\nnotably, our model is the first deep learning-based method to outperform\ntraditional Numerical Weather Prediction (NWP) approaches in extreme\nprecipitation conditions. It shows improvements of 15.6%, 17.4%, and 31.8% over\nNWP predictions in heavy rain CSI on respective datasets. These results\nhighlight the potential impact of our model in reducing the severe consequences\nof extreme weather events.\n","authors":["Yujin Tang","Jiaming Zhou","Xiang Pan","Zeying Gong","Junwei Liang"],"pdf_url":"https://arxiv.org/pdf/2310.02676v1.pdf","comment":"16 pages, 3 figures. arXiv admin note: text overlap with\n arXiv:2105.05537, arXiv:2206.15241 by other authors"},{"id":"http://arxiv.org/abs/2310.02674v1","updated":"2023-10-04T09:26:44Z","published":"2023-10-04T09:26:44Z","title":"Land-cover change detection using paired OpenStreetMap data and optical\n high-resolution imagery via object-guided Transformer","summary":" Optical high-resolution imagery and OpenStreetMap (OSM) data are two\nimportant data sources for land-cover change detection. Previous studies in\nthese two data sources focus on utilizing the information in OSM data to aid\nthe change detection on multi-temporal optical high-resolution images. 
This\npaper pioneers the direct detection of land-cover changes utilizing paired OSM\ndata and optical imagery, thereby broadening the horizons of change detection\ntasks to encompass more dynamic earth observations. To this end, we propose an\nobject-guided Transformer (ObjFormer) architecture by naturally combining the\nprevalent object-based image analysis (OBIA) technique with the advanced vision\nTransformer architecture. The introduction of OBIA can significantly reduce the\ncomputational overhead and memory burden in the self-attention module.\nSpecifically, the proposed ObjFormer has a hierarchical pseudo-siamese encoder\nconsisting of object-guided self-attention modules that extract representative\nfeatures of different levels from OSM data and optical images; a decoder\nconsisting of object-guided cross-attention modules can progressively recover\nthe land-cover changes from the extracted heterogeneous features. In addition\nto the basic supervised binary change detection task, this paper raises a new\nsemi-supervised semantic change detection task that does not require any\nmanually annotated land-cover labels of optical images to train semantic change\ndetectors. Two lightweight semantic decoders are added to ObjFormer to\naccomplish this task efficiently. A converse cross-entropy loss is designed to\nfully utilize the negative samples, thereby contributing to the great\nperformance improvement in this task. The first large-scale benchmark dataset\ncontaining 1,287 map-image pairs (1024$\\times$ 1024 pixels for each sample)\ncovering 40 regions on six continents ...(see the manuscript for the full\nabstract)\n","authors":["Hongruixuan Chen","Cuiling Lan","Jian Song","Clifford Broni-Bediako","Junshi Xia","Naoto Yokoya"],"pdf_url":"https://arxiv.org/pdf/2310.02674v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.05727v2","updated":"2023-10-04T09:11:39Z","published":"2023-04-12T09:34:13Z","title":"Preemptively Pruning Clever-Hans Strategies in Deep Neural Networks","summary":" Explainable AI has become a popular tool for validating machine learning\nmodels. Mismatches between the explained model's decision strategy and the\nuser's domain knowledge (e.g. Clever Hans effects) have also been recognized as\na starting point for improving faulty models. However, it is less clear what to\ndo when the user and the explanation agree. In this paper, we demonstrate that\nacceptance of explanations by the user is not a guarantee for a machine\nlearning model to function well, in particular, some Clever Hans effects may\nremain undetected. Such hidden flaws of the model can nevertheless be\nmitigated, and we demonstrate this by contributing a new method,\nExplanation-Guided Exposure Minimization (EGEM), that preemptively prunes\nvariations in the ML model that have not been the subject of positive\nexplanation feedback. Experiments on natural image data demonstrate that our\napproach leads to models that strongly reduce their reliance on hidden Clever\nHans strategies, and consequently achieve higher accuracy on new data.\n","authors":["Lorenz Linhardt","Klaus-Robert Müller","Grégoire Montavon"],"pdf_url":"https://arxiv.org/pdf/2304.05727v2.pdf","comment":"18 pages + supplement"},{"id":"http://arxiv.org/abs/2306.06991v2","updated":"2023-10-04T09:10:03Z","published":"2023-06-12T09:38:04Z","title":"Fast Diffusion Model","summary":" Diffusion models (DMs) have been adopted across diverse fields with its\nremarkable abilities in capturing intricate data distributions. 
In this paper,\nwe propose a Fast Diffusion Model (FDM) to significantly speed up DMs from a\nstochastic optimization perspective for both faster training and sampling. We\nfirst find that the diffusion process of DMs accords with the stochastic\noptimization process of stochastic gradient descent (SGD) on a stochastic\ntime-variant problem. Then, inspired by momentum SGD that uses both gradient\nand an extra momentum to achieve faster and more stable convergence than SGD,\nwe integrate momentum into the diffusion process of DMs. This comes with a\nunique challenge of deriving the noise perturbation kernel from the\nmomentum-based diffusion process. To this end, we frame the process as a Damped\nOscillation system whose critically damped state -- the kernel solution --\navoids oscillation and yields a faster convergence speed of the diffusion\nprocess. Empirical results show that our FDM can be applied to several popular\nDM frameworks, e.g., VP, VE, and EDM, and reduces their training cost by about\n50% with comparable image synthesis performance on CIFAR-10, FFHQ, and AFHQv2\ndatasets. Moreover, FDM decreases their sampling steps by about 3x to achieve\nsimilar performance under the same samplers. The code is available at\nhttps://github.com/sail-sg/FDM.\n","authors":["Zike Wu","Pan Zhou","Kenji Kawaguchi","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.06991v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.16066v3","updated":"2023-10-04T09:05:24Z","published":"2023-05-25T13:56:30Z","title":"Guided Attention for Next Active Object @ EGO4D STA Challenge","summary":" In this technical report, we describe the Guided-Attention mechanism based\nsolution for the short-term anticipation (STA) challenge for the EGO4D\nchallenge. It combines the object detections, and the spatiotemporal features\nextracted from video clips, enhancing the motion and contextual information,\nand further decoding the object-centric and motion-centric information to\naddress the problem of STA in egocentric videos. For the challenge, we build\nour model on top of StillFast with Guided Attention applied on fast network.\nOur model obtains better performance on the validation set and also achieves\nstate-of-the-art (SOTA) results on the challenge test set for EGO4D Short-Term\nObject Interaction Anticipation Challenge.\n","authors":["Sanket Thakur","Cigdem Beyan","Pietro Morerio","Vittorio Murino","Alessio Del Bue"],"pdf_url":"https://arxiv.org/pdf/2305.16066v3.pdf","comment":"Winner of CVPR@2023 Ego4D STA challenge. arXiv admin note:\n substantial text overlap with arXiv:2305.12953"},{"id":"http://arxiv.org/abs/2310.00527v3","updated":"2023-10-04T09:05:17Z","published":"2023-10-01T00:13:06Z","title":"Self-supervised Learning of Contextualized Local Visual Embeddings","summary":" We present Contextualized Local Visual Embeddings (CLoVE), a self-supervised\nconvolutional-based method that learns representations suited for dense\nprediction tasks. CLoVE deviates from current methods and optimizes a single\nloss function that operates at the level of contextualized local embeddings\nlearned from output feature maps of convolution neural network (CNN) encoders.\nTo learn contextualized embeddings, CLoVE proposes a normalized mult-head\nself-attention layer that combines local features from different parts of an\nimage based on similarity. We extensively benchmark CLoVE's pre-trained\nrepresentations on multiple datasets. 
CLoVE reaches state-of-the-art\nperformance for CNN-based architectures in 4 dense prediction downstream tasks,\nincluding object detection, instance segmentation, keypoint detection, and\ndense pose estimation.\n","authors":["Thalles Santos Silva","Helio Pedrini","Adín Ramírez Rivera"],"pdf_url":"https://arxiv.org/pdf/2310.00527v3.pdf","comment":"Pre-print. 4th Visual Inductive Priors for Data-Efficient Deep\n Learning Workshop ICCV 2023. Code at https://github.com/sthalles/CLoVE"},{"id":"http://arxiv.org/abs/2308.08303v2","updated":"2023-10-04T09:04:44Z","published":"2023-08-16T12:07:02Z","title":"Leveraging Next-Active Objects for Context-Aware Anticipation in\n Egocentric Videos","summary":" Objects are crucial for understanding human-object interactions. By\nidentifying the relevant objects, one can also predict potential future\ninteractions or actions that may occur with these objects. In this paper, we\nstudy the problem of Short-Term Object interaction anticipation (STA) and\npropose NAOGAT (Next-Active-Object Guided Anticipation Transformer), a\nmulti-modal end-to-end transformer network, that attends to objects in observed\nframes in order to anticipate the next-active-object (NAO) and, eventually, to\nguide the model to predict context-aware future actions. The task is\nchallenging since it requires anticipating future action along with the object\nwith which the action occurs and the time after which the interaction will\nbegin, a.k.a. the time to contact (TTC). Compared to existing video modeling\narchitectures for action anticipation, NAOGAT captures the relationship between\nobjects and the global scene context in order to predict detections for the\nnext active object and anticipate relevant future actions given these\ndetections, leveraging the objects' dynamics to improve accuracy. One of the\nkey strengths of our approach, in fact, is its ability to exploit the motion\ndynamics of objects within a given clip, which is often ignored by other\nmodels, and separately decoding the object-centric and motion-centric\ninformation. Through our experiments, we show that our model outperforms\nexisting methods on two separate datasets, Ego4D and EpicKitchens-100 (\"Unseen\nSet\"), as measured by several additional metrics, such as time to contact, and\nnext-active-object localization. The code will be available upon acceptance.\n","authors":["Sanket Thakur","Cigdem Beyan","Pietro Morerio","Vittorio Murino","Alessio Del Bue"],"pdf_url":"https://arxiv.org/pdf/2308.08303v2.pdf","comment":"Accepted in WACV'24"},{"id":"http://arxiv.org/abs/2310.02664v1","updated":"2023-10-04T09:04:20Z","published":"2023-10-04T09:04:20Z","title":"On Memorization in Diffusion Models","summary":" Due to their capacity to generate novel and high-quality samples, diffusion\nmodels have attracted significant research interest in recent years. Notably,\nthe typical training objective of diffusion models, i.e., denoising score\nmatching, has a closed-form optimal solution that can only generate training\ndata replicating samples. 
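The closed-form optimum referred to here can be written out explicitly for a finite training set: the denoiser minimizing the denoising-score-matching objective is the posterior mean, a softmax-weighted average of training points, which is why sampling with it can only reproduce training data. A toy sketch follows; the schedule values a_t and sigma_t, the data, and the shapes are assumptions for illustration.

# Hedged illustration: optimal denoiser E[x_0 | x_t] under an empirical data distribution.
import numpy as np

def optimal_denoiser(x_t, train_data, a_t, sigma_t):
    # log-weights proportional to the Gaussian likelihood of each training point
    d2 = ((x_t[None, :] - a_t * train_data) ** 2).sum(axis=1)
    logw = -d2 / (2.0 * sigma_t ** 2)
    w = np.exp(logw - logw.max())
    w /= w.sum()
    return w @ train_data          # softmax-weighted average of training points

train_data = np.random.randn(100, 2)              # toy training set
a_t, sigma_t = 0.7, 0.5                           # assumed schedule values at step t
x_t = a_t * train_data[0] + sigma_t * np.random.randn(2)   # noised training sample
print(optimal_denoiser(x_t, train_data, a_t, sigma_t))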
This indicates that a memorization behavior is\ntheoretically expected, which contradicts the common generalization ability of\nstate-of-the-art diffusion models, and thus calls for a deeper understanding.\nLooking into this, we first observe that memorization behaviors tend to occur\non smaller-sized datasets, which motivates our definition of effective model\nmemorization (EMM), a metric measuring the maximum size of training data at\nwhich a learned diffusion model approximates its theoretical optimum. Then, we\nquantify the impact of the influential factors on these memorization behaviors\nin terms of EMM, focusing primarily on data distribution, model configuration,\nand training procedure. Besides comprehensive empirical results identifying the\ninfluential factors, we surprisingly find that conditioning training data on\nuninformative random labels can significantly trigger the memorization in\ndiffusion models. Our study holds practical significance for diffusion model\nusers and offers clues to theoretical research in deep generative models. Code\nis available at https://github.com/sail-sg/DiffMemorize.\n","authors":["Xiangming Gu","Chao Du","Tianyu Pang","Chongxuan Li","Min Lin","Ye Wang"],"pdf_url":"https://arxiv.org/pdf/2310.02664v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02663v1","updated":"2023-10-04T08:58:23Z","published":"2023-10-04T08:58:23Z","title":"MedPrompt: Cross-Modal Prompting for Multi-Task Medical Image\n Translation","summary":" Cross-modal medical image translation is an essential task for synthesizing\nmissing modality data for clinical diagnosis. However, current learning-based\ntechniques have limitations in capturing cross-modal and global features,\nrestricting their suitability to specific pairs of modalities. This lack of\nversatility undermines their practical usefulness, particularly considering\nthat the missing modality may vary for different cases. In this study, we\npresent MedPrompt, a multi-task framework that efficiently translates different\nmodalities. Specifically, we propose the Self-adaptive Prompt Block, which\ndynamically guides the translation network towards distinct modalities. Within\nthis framework, we introduce the Prompt Extraction Block and the Prompt Fusion\nBlock to efficiently encode the cross-modal prompt. To enhance the extraction\nof global features across diverse modalities, we incorporate the Transformer\nmodel. Extensive experimental results involving five datasets and four pairs of\nmodalities demonstrate that our proposed model achieves state-of-the-art visual\nquality and exhibits excellent generalization capability.\n","authors":["Xuhang Chen","Chi-Man Pun","Shuqiang Wang"],"pdf_url":"https://arxiv.org/pdf/2310.02663v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02650v1","updated":"2023-10-04T08:18:30Z","published":"2023-10-04T08:18:30Z","title":"Active Visual Localization for Multi-Agent Collaboration: A Data-Driven\n Approach","summary":" Rather than having each newly deployed robot create its own map of its\nsurroundings, the growing availability of SLAM-enabled devices provides the\noption of simply localizing in a map of another robot or device. In cases such\nas multi-robot or human-robot collaboration, localizing all agents in the same\nmap is even necessary. However, localizing e.g. a ground robot in the map of a\ndrone or head-mounted MR headset presents unique challenges due to viewpoint\nchanges. 
This work investigates how active visual localization can be used to\novercome such challenges of viewpoint changes. Specifically, we focus on the\nproblem of selecting the optimal viewpoint at a given location. We compare\nexisting approaches in the literature with additional proposed baselines and\npropose a novel data-driven approach. The results demonstrate the superior\nperformance of the data-driven approach when compared to existing methods, both\nin controlled simulation experiments and real-world deployment.\n","authors":["Matthew Hanlon","Boyang Sun","Marc Pollefeys","Hermann Blum"],"pdf_url":"https://arxiv.org/pdf/2310.02650v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02642v1","updated":"2023-10-04T08:02:33Z","published":"2023-10-04T08:02:33Z","title":"GET: Group Event Transformer for Event-Based Vision","summary":" Event cameras are a type of novel neuromorphic sensor that has been gaining\nincreasing attention. Existing event-based backbones mainly rely on image-based\ndesigns to extract spatial information within the image transformed from\nevents, overlooking important event properties like time and polarity. To\naddress this issue, we propose a novel Group-based vision Transformer backbone\nfor Event-based vision, called Group Event Transformer (GET), which decouples\ntemporal-polarity information from spatial information throughout the feature\nextraction process. Specifically, we first propose a new event representation\nfor GET, named Group Token, which groups asynchronous events based on their\ntimestamps and polarities. Then, GET applies the Event Dual Self-Attention\nblock and the Group Token Aggregation module to facilitate effective feature\ncommunication and integration in both the spatial and temporal-polarity\ndomains. After that, GET can be integrated with different downstream tasks by\nconnecting it with various heads. We evaluate our method on four event-based\nclassification datasets (Cifar10-DVS, N-MNIST, N-CARS, and DVS128Gesture) and\ntwo event-based object detection datasets (1Mpx and Gen1), and the results\ndemonstrate that GET outperforms other state-of-the-art methods. The code is\navailable at https://github.com/Peterande/GET-Group-Event-Transformer.\n","authors":["Yansong Peng","Yueyi Zhang","Zhiwei Xiong","Xiaoyan Sun","Feng Wu"],"pdf_url":"https://arxiv.org/pdf/2310.02642v1.pdf","comment":"This paper is accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2310.02641v1","updated":"2023-10-04T08:01:36Z","published":"2023-10-04T08:01:36Z","title":"Deformation-Invariant Neural Network and Its Applications in Distorted\n Image Restoration and Analysis","summary":" Images degraded by geometric distortions pose a significant challenge to\nimaging and computer vision tasks such as object recognition. Deep\nlearning-based imaging models usually fail to give accurate performance for\ngeometrically distorted images. In this paper, we propose the\ndeformation-invariant neural network (DINN), a framework to address the problem\nof imaging tasks for geometrically distorted images. The DINN outputs\nconsistent latent features for images that are geometrically distorted but\nrepresent the same underlying object or scene. The idea of DINN is to\nincorporate a simple component, called the quasiconformal transformer network\n(QCTN), into other existing deep networks for imaging tasks. 
The QCTN is a deep\nneural network that outputs a quasiconformal map, which can be used to\ntransform a geometrically distorted image into an improved version that is\ncloser to the distribution of natural or good images. It first outputs a\nBeltrami coefficient, which measures the quasiconformality of the output\ndeformation map. By controlling the Beltrami coefficient, the local geometric\ndistortion under the quasiconformal mapping can be controlled. The QCTN is\nlightweight and simple, which can be readily integrated into other existing\ndeep neural networks to enhance their performance. Leveraging our framework, we\nhave developed an image classification network that achieves accurate\nclassification of distorted images. Our proposed framework has been applied to\nrestore geometrically distorted images by atmospheric turbulence and water\nturbulence. DINN outperforms existing GAN-based restoration methods under these\nscenarios, demonstrating the effectiveness of the proposed framework.\nAdditionally, we apply our proposed framework to the 1-1 verification of human\nface images under atmospheric turbulence and achieve satisfactory performance,\nfurther demonstrating the efficacy of our approach.\n","authors":["Han Zhang","Qiguang Chen","Lok Ming Lui"],"pdf_url":"https://arxiv.org/pdf/2310.02641v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02638v1","updated":"2023-10-04T08:00:05Z","published":"2023-10-04T08:00:05Z","title":"P2CADNet: An End-to-End Reconstruction Network for Parametric 3D CAD\n Model from Point Clouds","summary":" Computer Aided Design (CAD), especially the feature-based parametric CAD,\nplays an important role in modern industry and society. However, the\nreconstruction of featured CAD model is more challenging than the\nreconstruction of other CAD models. To this end, this paper proposes an\nend-to-end network to reconstruct featured CAD model from point cloud\n(P2CADNet). Initially, the proposed P2CADNet architecture combines a point\ncloud feature extractor, a CAD sequence reconstructor and a parameter\noptimizer. Subsequently, in order to reconstruct the featured CAD model in an\nautoregressive way, the CAD sequence reconstructor applies two transformer\ndecoders, one with target mask and the other without mask. Finally, for\npredicting parameters more precisely, we design a parameter optimizer with\ncross-attention mechanism to further refine the CAD feature parameters. We\nevaluate P2CADNet on the public dataset, and the experimental results show that\nP2CADNet has excellent reconstruction quality and accuracy. To our best\nknowledge, P2CADNet is the first end-to-end network to reconstruct featured CAD\nmodel from point cloud, and can be regarded as baseline for future works.\nTherefore, we open the source code at https://github.com/Blice0415/P2CADNet.\n","authors":["Zhihao Zong","Fazhi He","Rubin Fan","Yuxin Liu"],"pdf_url":"https://arxiv.org/pdf/2310.02638v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.10574v2","updated":"2023-10-04T07:52:19Z","published":"2022-07-20T13:37:57Z","title":"Co-Located Human-Human Interaction Analysis using Nonverbal Cues: A\n Survey","summary":" Automated co-located human-human interaction analysis has been addressed by\nthe use of nonverbal communication as measurable evidence of social and\npsychological phenomena. 
We survey the computing studies (since 2010) detecting\nphenomena related to social traits (e.g., leadership, dominance, personality\ntraits), social roles/relations, and interaction dynamics (e.g., group\ncohesion, engagement, rapport). Our target is to identify the nonverbal cues\nand computational methodologies resulting in effective performance. This survey\ndiffers from its counterparts by involving the widest spectrum of social\nphenomena and interaction settings (free-standing conversations, meetings,\ndyads, and crowds). We also present a comprehensive summary of the related\ndatasets and outline future research directions which are regarding the\nimplementation of artificial intelligence, dataset curation, and\nprivacy-preserving interaction analysis. Some major observations are: the most\noften used nonverbal cue, computational method, interaction environment, and\nsensing approach are speaking activity, support vector machines, and meetings\ncomposed of 3-4 persons equipped with microphones and cameras, respectively;\nmultimodal features are prominently performing better; deep learning\narchitectures showed improved performance in overall, but there exist many\nphenomena whose detection has never been implemented through deep models. We\nalso identified several limitations such as the lack of scalable benchmarks,\nannotation reliability tests, cross-dataset experiments, and explainability\nanalysis.\n","authors":["Cigdem Beyan","Alessandro Vinciarelli","Alessio Del Bue"],"pdf_url":"https://arxiv.org/pdf/2207.10574v2.pdf","comment":"This is the author's version of the work. It is posted here for your\n personal use. Not for redistribution. The definitive version was published in\n ACM Computing Surveys, https://doi.org/10.1145/3626516"},{"id":"http://arxiv.org/abs/2305.16311v2","updated":"2023-10-04T07:38:36Z","published":"2023-05-25T17:59:04Z","title":"Break-A-Scene: Extracting Multiple Concepts from a Single Image","summary":" Text-to-image model personalization aims to introduce a user-provided concept\nto the model, allowing its synthesis in diverse contexts. However, current\nmethods primarily focus on the case of learning a single concept from multiple\nimages with variations in backgrounds and poses, and struggle when adapted to a\ndifferent scenario. In this work, we introduce the task of textual scene\ndecomposition: given a single image of a scene that may contain several\nconcepts, we aim to extract a distinct text token for each concept, enabling\nfine-grained control over the generated scenes. To this end, we propose\naugmenting the input image with masks that indicate the presence of target\nconcepts. These masks can be provided by the user or generated automatically by\na pre-trained segmentation model. We then present a novel two-phase\ncustomization process that optimizes a set of dedicated textual embeddings\n(handles), as well as the model weights, striking a delicate balance between\naccurately capturing the concepts and avoiding overfitting. We employ a masked\ndiffusion loss to enable handles to generate their assigned concepts,\ncomplemented by a novel loss on cross-attention maps to prevent entanglement.\nWe also introduce union-sampling, a training strategy aimed to improve the\nability of combining multiple concepts in generated images. We use several\nautomatic metrics to quantitatively compare our method against several\nbaselines, and further affirm the results using a user study. Finally, we\nshowcase several applications of our method. 
Project page is available at:\nhttps://omriavrahami.com/break-a-scene/\n","authors":["Omri Avrahami","Kfir Aberman","Ohad Fried","Daniel Cohen-Or","Dani Lischinski"],"pdf_url":"https://arxiv.org/pdf/2305.16311v2.pdf","comment":"SIGGRAPH Asia 2023. Project page: at:\n https://omriavrahami.com/break-a-scene/ Video:\n https://www.youtube.com/watch?v=-9EA-BhizgM"},{"id":"http://arxiv.org/abs/2309.17264v2","updated":"2023-10-04T07:37:28Z","published":"2023-09-29T14:17:24Z","title":"A Foundation Model for General Moving Object Segmentation in Medical\n Images","summary":" Medical image segmentation aims to delineate the anatomical or pathological\nstructures of interest, playing a crucial role in clinical diagnosis. A\nsubstantial amount of high-quality annotated data is crucial for constructing\nhigh-precision deep segmentation models. However, medical annotation is highly\ncumbersome and time-consuming, especially for medical videos or 3D volumes, due\nto the huge labeling space and poor inter-frame consistency. Recently, a\nfundamental task named Moving Object Segmentation (MOS) has made significant\nadvancements in natural images. Its objective is to delineate moving objects\nfrom the background within image sequences, requiring only minimal annotations.\nIn this paper, we propose the first foundation model, named iMOS, for MOS in\nmedical images. Extensive experiments on a large multi-modal medical dataset\nvalidate the effectiveness of the proposed iMOS. Specifically, with the\nannotation of only a small number of images in the sequence, iMOS can achieve\nsatisfactory tracking and segmentation performance of moving objects throughout\nthe entire sequence in bi-directions. We hope that the proposed iMOS can help\naccelerate the annotation speed of experts, and boost the development of\nmedical foundation models.\n","authors":["Zhongnuo Yan","Tong Han","Yuhao Huang","Lian Liu","Han Zhou","Jiongquan Chen","Wenlong Shi","Yan Cao","Xin Yang","Dong Ni"],"pdf_url":"https://arxiv.org/pdf/2309.17264v2.pdf","comment":"6 pages, 8 figures, 3 tables"},{"id":"http://arxiv.org/abs/2310.02611v1","updated":"2023-10-04T06:52:03Z","published":"2023-10-04T06:52:03Z","title":"Analyzing and Improving OT-based Adversarial Networks","summary":" Optimal Transport (OT) problem aims to find a transport plan that bridges two\ndistributions while minimizing a given cost function. OT theory has been widely\nutilized in generative modeling. In the beginning, OT distance has been used as\na measure for assessing the distance between data and generated distributions.\nRecently, OT transport map between data and prior distributions has been\nutilized as a generative model. These OT-based generative models share a\nsimilar adversarial training objective. In this paper, we begin by unifying\nthese OT-based adversarial methods within a single framework. Then, we\nelucidate the role of each component in training dynamics through a\ncomprehensive analysis of this unified framework. Moreover, we suggest a simple\nbut novel method that improves the previously best-performing OT-based model.\nIntuitively, our approach conducts a gradual refinement of the generated\ndistribution, progressively aligning it with the data distribution. 
Our\napproach achieves a FID score of 2.51 on CIFAR-10, outperforming unified\nOT-based adversarial approaches.\n","authors":["Jaemoo Choi","Jaewoong Choi","Myungjoo Kang"],"pdf_url":"https://arxiv.org/pdf/2310.02611v1.pdf","comment":"20 pages, 13 figures"},{"id":"http://arxiv.org/abs/2310.02601v1","updated":"2023-10-04T06:14:06Z","published":"2023-10-04T06:14:06Z","title":"MagicDrive: Street View Generation with Diverse 3D Geometry Control","summary":" Recent advancements in diffusion models have significantly enhanced the data\nsynthesis with 2D control. Yet, precise 3D control in street view generation,\ncrucial for 3D perception tasks, remains elusive. Specifically, utilizing\nBird's-Eye View (BEV) as the primary condition often leads to challenges in\ngeometry control (e.g., height), affecting the representation of object shapes,\nocclusion patterns, and road surface elevations, all of which are essential to\nperception data synthesis, especially for 3D object detection tasks. In this\npaper, we introduce MagicDrive, a novel street view generation framework\noffering diverse 3D geometry controls, including camera poses, road maps, and\n3D bounding boxes, together with textual descriptions, achieved through\ntailored encoding strategies. Besides, our design incorporates a cross-view\nattention module, ensuring consistency across multiple camera views. With\nMagicDrive, we achieve high-fidelity street-view synthesis that captures\nnuanced 3D geometry and various scene descriptions, enhancing tasks like BEV\nsegmentation and 3D object detection.\n","authors":["Ruiyuan Gao","Kai Chen","Enze Xie","Lanqing Hong","Zhenguo Li","Dit-Yan Yeung","Qiang Xu"],"pdf_url":"https://arxiv.org/pdf/2310.02601v1.pdf","comment":"Project Page: https://flymin.github.io/magicdrive"},{"id":"http://arxiv.org/abs/2305.14777v2","updated":"2023-10-04T06:10:48Z","published":"2023-05-24T06:31:05Z","title":"Generative Modeling through the Semi-dual Formulation of Unbalanced\n Optimal Transport","summary":" Optimal Transport (OT) problem investigates a transport map that bridges two\ndistributions while minimizing a given cost function. In this regard, OT\nbetween tractable prior distribution and data has been utilized for generative\nmodeling tasks. However, OT-based methods are susceptible to outliers and face\noptimization challenges during training. In this paper, we propose a novel\ngenerative model based on the semi-dual formulation of Unbalanced Optimal\nTransport (UOT). Unlike OT, UOT relaxes the hard constraint on distribution\nmatching. This approach provides better robustness against outliers, stability\nduring training, and faster convergence. We validate these properties\nempirically through experiments. Moreover, we study the theoretical upper-bound\nof divergence between distributions in UOT. Our model outperforms existing\nOT-based generative models, achieving FID scores of 2.97 on CIFAR-10 and 5.80\non CelebA-HQ-256. The code is available at\n\\url{https://github.com/Jae-Moo/UOTM}.\n","authors":["Jaemoo Choi","Jaewoong Choi","Myungjoo Kang"],"pdf_url":"https://arxiv.org/pdf/2305.14777v2.pdf","comment":"23 pages, 15 figures"},{"id":"http://arxiv.org/abs/2310.02596v1","updated":"2023-10-04T05:59:50Z","published":"2023-10-04T05:59:50Z","title":"SweetDreamer: Aligning Geometric Priors in 2D Diffusion for Consistent\n Text-to-3D","summary":" It is inherently ambiguous to lift 2D results from pre-trained diffusion\nmodels to a 3D world for text-to-3D generation. 
2D diffusion models solely\nlearn view-agnostic priors and thus lack 3D knowledge during the lifting,\nleading to the multi-view inconsistency problem. We find that this problem\nprimarily stems from geometric inconsistency, and avoiding misplaced geometric\nstructures substantially mitigates the problem in the final outputs. Therefore,\nwe improve the consistency by aligning the 2D geometric priors in diffusion\nmodels with well-defined 3D shapes during the lifting, addressing the vast\nmajority of the problem. This is achieved by fine-tuning the 2D diffusion model\nto be viewpoint-aware and to produce view-specific coordinate maps of\ncanonically oriented 3D objects. In our process, only coarse 3D information is\nused for aligning. This \"coarse\" alignment not only resolves the multi-view\ninconsistency in geometries but also retains the ability in 2D diffusion models\nto generate detailed and diversified high-quality objects unseen in the 3D\ndatasets. Furthermore, our aligned geometric priors (AGP) are generic and can\nbe seamlessly integrated into various state-of-the-art pipelines, obtaining\nhigh generalizability in terms of unseen shapes and visual appearance while\ngreatly alleviating the multi-view inconsistency problem. Our method represents\na new state-of-the-art performance with an 85+% consistency rate by human\nevaluation, while many previous methods are around 30%. Our project page is\nhttps://sweetdreamer3d.github.io/\n","authors":["Weiyu Li","Rui Chen","Xuelin Chen","Ping Tan"],"pdf_url":"https://arxiv.org/pdf/2310.02596v1.pdf","comment":"Project page: https://sweetdreamer3d.github.io/"},{"id":"http://arxiv.org/abs/2304.10880v2","updated":"2023-10-04T05:55:05Z","published":"2023-04-21T10:47:13Z","title":"Med-Tuning: Parameter-Efficient Transfer Learning with Fine-Grained\n Feature Enhancement for Medical Volumetric Segmentation","summary":" Deep learning-based medical volumetric segmentation methods either train the\nmodel from scratch or follow the standard \"pre-training then fine-tuning\"\nparadigm. Although fine-tuning a pre-trained model on downstream tasks can\nharness its representation power, the standard full fine-tuning is costly in\nterms of computation and memory footprint. In this paper, we present the study\non parameter-efficient transfer learning for medical volumetric segmentation\nand propose a new framework named Med-Tuning based on intra-stage feature\nenhancement and inter-stage feature interaction. Additionally, aiming at\nexploiting the intrinsic global properties of Fourier Transform for\nparameter-efficient transfer learning, a new adapter block namely Med-Adapter\nwith a well-designed Fourier Transform branch is proposed for effectively and\nefficiently modeling the crucial global context for medical volumetric\nsegmentation. Given a large-scale pre-trained model on 2D natural images, our\nmethod can exploit both the crucial spatial multi-scale feature and volumetric\ncorrelations along slices for accurate segmentation. Extensive experiments on\nthree benchmark datasets (including CT and MRI) show that our method can\nachieve better results than previous parameter-efficient transfer learning\nmethods on segmentation tasks, with much less tuned parameter costs. 
Compared\nto full fine-tuning, our method reduces the finetuned model parameters by up to\n4x, with even better segmentation performance.\n","authors":["Wenxuan Wang","Jiachen Shen","Chen Chen","Jianbo Jiao","Jing Liu","Yan Zhang","Shanshan Song","Jiangyun Li"],"pdf_url":"https://arxiv.org/pdf/2304.10880v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02588v1","updated":"2023-10-04T05:09:50Z","published":"2023-10-04T05:09:50Z","title":"ViT-ReciproCAM: Gradient and Attention-Free Visual Explanations for\n Vision Transformer","summary":" This paper presents a novel approach to address the challenges of\nunderstanding the prediction process and debugging prediction errors in Vision\nTransformers (ViT), which have demonstrated superior performance in various\ncomputer vision tasks such as image classification and object detection. While\nseveral visual explainability techniques, such as CAM, Grad-CAM, Score-CAM, and\nRecipro-CAM, have been extensively researched for Convolutional Neural Networks\n(CNNs), limited research has been conducted on ViT. Current state-of-the-art\nsolutions for ViT rely on class agnostic Attention-Rollout and Relevance\ntechniques. In this work, we propose a new gradient-free visual explanation\nmethod for ViT, called ViT-ReciproCAM, which does not require attention matrix\nand gradient information. ViT-ReciproCAM utilizes token masking and generated\nnew layer outputs from the target layer's input to exploit the correlation\nbetween activated tokens and network predictions for target classes. Our\nproposed method outperforms the state-of-the-art Relevance method in the\nAverage Drop-Coherence-Complexity (ADCC) metric by $4.58\\%$ to $5.80\\%$ and\ngenerates more localized saliency maps. Our experiments demonstrate the\neffectiveness of ViT-ReciproCAM and showcase its potential for understanding\nand debugging ViT models. Our proposed method provides an efficient and\neasy-to-implement alternative for generating visual explanations, without\nrequiring attention and gradient information, which can be beneficial for\nvarious applications in the field of computer vision.\n","authors":["Seok-Yong Byun","Wonju Lee"],"pdf_url":"https://arxiv.org/pdf/2310.02588v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02576v1","updated":"2023-10-04T04:27:16Z","published":"2023-10-04T04:27:16Z","title":"A Prototype-Based Neural Network for Image Anomaly Detection and\n Localization","summary":" Image anomaly detection and localization perform not only image-level anomaly\nclassification but also locate pixel-level anomaly regions. Recently, it has\nreceived much research attention due to its wide application in various fields.\nThis paper proposes ProtoAD, a prototype-based neural network for image anomaly\ndetection and localization. First, the patch features of normal images are\nextracted by a deep network pre-trained on nature images. Then, the prototypes\nof the normal patch features are learned by non-parametric clustering. Finally,\nwe construct an image anomaly localization network (ProtoAD) by appending the\nfeature extraction network with $L2$ feature normalization, a $1\\times1$\nconvolutional layer, a channel max-pooling, and a subtraction operation. We use\nthe prototypes as the kernels of the $1\\times1$ convolutional layer; therefore,\nour neural network does not need a training phase and can conduct anomaly\ndetection and localization in an end-to-end manner. 
Extensive experiments on\ntwo challenging industrial anomaly detection datasets, MVTec AD and BTAD,\ndemonstrate that ProtoAD achieves competitive performance compared to the\nstate-of-the-art methods with a higher inference speed. The source code is\navailable at: https://github.com/98chao/ProtoAD.\n","authors":["Chao Huang","Zhao Kang","Hong Wu"],"pdf_url":"https://arxiv.org/pdf/2310.02576v1.pdf","comment":"20 pages, 4 figures"},{"id":"http://arxiv.org/abs/2310.02575v1","updated":"2023-10-04T04:26:33Z","published":"2023-10-04T04:26:33Z","title":"AdaMerging: Adaptive Model Merging for Multi-Task Learning","summary":" Multi-task learning (MTL) aims to empower a model to tackle multiple tasks\nsimultaneously. A recent development known as task arithmetic has revealed that\nseveral models, each fine-tuned for distinct tasks, can be directly merged into\na single model to execute MTL without necessitating a retraining process using\nthe initial training data. Nevertheless, this direct addition of models often\nleads to a significant deterioration in the overall performance of the merged\nmodel. This decline occurs due to potential conflicts and intricate\ncorrelations among the multiple tasks. Consequently, the challenge emerges of\nhow to merge pre-trained models more effectively without using their original\ntraining data. This paper introduces an innovative technique called Adaptive\nModel Merging (AdaMerging). This approach aims to autonomously learn the\ncoefficients for model merging, either in a task-wise or layer-wise manner,\nwithout relying on the original training data. Specifically, our AdaMerging\nmethod operates as an automatic, unsupervised task arithmetic scheme. It\nleverages entropy minimization on unlabeled test samples from the multi-task\nsetup as a surrogate objective function to iteratively refine the merging\ncoefficients of the multiple models. Our experimental findings across eight\ntasks demonstrate the efficacy of the AdaMerging scheme we put forth. Compared\nto the current state-of-the-art task arithmetic merging scheme, AdaMerging\nshowcases a remarkable 11\\% improvement in performance. Notably, AdaMerging\nalso exhibits superior generalization capabilities when applied to unseen\ndownstream tasks. Furthermore, it displays a significantly enhanced robustness\nto data distribution shifts that may occur during the testing phase.\n","authors":["Enneng Yang","Zhenyi Wang","Li Shen","Shiwei Liu","Guibing Guo","Xingwei Wang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2310.02575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02569v1","updated":"2023-10-04T04:07:37Z","published":"2023-10-04T04:07:37Z","title":"ReForm-Eval: Evaluating Large Vision Language Models via Unified\n Re-Formulation of Task-Oriented Benchmarks","summary":" Recent years have witnessed remarkable progress in the development of large\nvision-language models (LVLMs). Benefiting from the strong language backbones\nand efficient cross-modal alignment strategies, LVLMs exhibit surprising\ncapabilities to perceive visual signals and perform visually grounded\nreasoning. However, the capabilities of LVLMs have not been comprehensively and\nquantitatively evaluated. Most existing multi-modal benchmarks require\ntask-oriented input-output formats, posing great challenges to automatically\nassess the free-form text output of LVLMs. 
To effectively leverage the\nannotations available in existing benchmarks and reduce the manual effort\nrequired for constructing new benchmarks, we propose to re-formulate existing\nbenchmarks into unified LVLM-compatible formats. Through systematic data\ncollection and reformulation, we present the ReForm-Eval benchmark, offering\nsubstantial data for evaluating various capabilities of LVLMs. Based on\nReForm-Eval, we conduct extensive experiments, thoroughly analyze the strengths\nand weaknesses of existing LVLMs, and identify the underlying factors. Our\nbenchmark and evaluation framework will be open-sourced as a cornerstone for\nadvancing the development of LVLMs.\n","authors":["Zejun Li","Ye Wang","Mengfei Du","Qingwen Liu","Binhao Wu","Jiwen Zhang","Chengxing Zhou","Zhihao Fan","Jie Fu","Jingjing Chen","Xuanjing Huang","Zhongyu Wei"],"pdf_url":"https://arxiv.org/pdf/2310.02569v1.pdf","comment":"38 pages, 11 figures, 24 tables"},{"id":"http://arxiv.org/abs/2310.02567v1","updated":"2023-10-04T03:59:57Z","published":"2023-10-04T03:59:57Z","title":"Improving Automatic VQA Evaluation Using Large Language Models","summary":" 8 years after the visual question answering (VQA) task was proposed, accuracy\nremains the primary metric for automatic evaluation. VQA Accuracy has been\neffective so far in the IID evaluation setting. However, our community is\nundergoing a shift towards open-ended generative models and OOD evaluation. In\nthis new paradigm, the existing VQA Accuracy metric is overly stringent and\nunderestimates the performance of VQA systems. Thus, there is a need to develop\nmore robust automatic VQA metrics that serve as a proxy for human judgment. In\nthis work, we propose to leverage the in-context learning capabilities of\ninstruction-tuned large language models (LLMs) to build a better VQA metric. We\nformulate VQA evaluation as an answer-rating task where the LLM is instructed\nto score the accuracy of a candidate answer given a set of reference answers.\nWe demonstrate the proposed metric better correlates with human judgment\ncompared to existing metrics across several VQA models and benchmarks. We hope\nwide adoption of our metric will contribute to better estimating the research\nprogress on the VQA task.\n","authors":["Oscar Mañas","Benno Krojer","Aishwarya Agrawal"],"pdf_url":"https://arxiv.org/pdf/2310.02567v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01852v2","updated":"2023-10-04T03:48:19Z","published":"2023-10-03T07:33:27Z","title":"LanguageBind: Extending Video-Language Pretraining to N-modality by\n Language-based Semantic Alignment","summary":" The video-language (VL) pretraining has achieved remarkable improvement in\nmultiple downstream tasks. However, the current VL pretraining framework is\nhard to extend to multiple modalities (N modalities, N>=3) beyond vision and\nlanguage. We thus propose LanguageBind, taking the language as the bind across\ndifferent modalities because the language modality is well-explored and\ncontains rich semantics. Specifically, we freeze the language encoder acquired\nby VL pretraining, then train encoders for other modalities with contrastive\nlearning. As a result, all modalities are mapped to a shared feature space,\nimplementing multi-modal semantic alignment. While LanguageBind ensures that we\ncan extend VL modalities to N modalities, we also need a high-quality dataset\nwith alignment data pairs centered on language. 
We thus propose VIDAL-10M with\nVideo, Infrared, Depth, Audio and their corresponding Language, naming as\nVIDAL-10M. In our VIDAL-10M, all videos are from short video platforms with\ncomplete semantics rather than truncated segments from long videos, and all the\nvideo, depth, infrared, and audio modalities are aligned to their textual\ndescriptions. After pretraining on VIDAL-10M, we outperform ImageBind by 1.2%\nR@1 on the MSR-VTT dataset with only 15% of the parameters in the zero-shot\nvideo-text retrieval, validating the high quality of our dataset. Beyond this,\nour LanguageBind has achieved great improvement in the zero-shot video, audio,\ndepth, and infrared understanding tasks. For instance, on the LLVIP and NYU-D\ndatasets, LanguageBind outperforms ImageBind-huge with 23.8% and 11.1% top-1\naccuracy. Code address: https://github.com/PKU-YuanGroup/LanguageBind.\n","authors":["Bin Zhu","Bin Lin","Munan Ning","Yang Yan","Jiaxi Cui","HongFa Wang","Yatian Pang","Wenhao Jiang","Junwu Zhang","Zongwei Li","Wancai Zhang","Zhifeng Li","Wei Liu","Li Yuan"],"pdf_url":"https://arxiv.org/pdf/2310.01852v2.pdf","comment":"Under review as a conference paper at ICLR 2024"},{"id":"http://arxiv.org/abs/2310.02557v1","updated":"2023-10-04T03:30:32Z","published":"2023-10-04T03:30:32Z","title":"Generalization in diffusion models arises from geometry-adaptive\n harmonic representation","summary":" High-quality samples generated with score-based reverse diffusion algorithms\nprovide evidence that deep neural networks (DNN) trained for denoising can\nlearn high-dimensional densities, despite the curse of dimensionality. However,\nrecent reports of memorization of the training set raise the question of\nwhether these networks are learning the \"true\" continuous density of the data.\nHere, we show that two denoising DNNs trained on non-overlapping subsets of a\ndataset learn nearly the same score function, and thus the same density, with a\nsurprisingly small number of training images. This strong generalization\ndemonstrates an alignment of powerful inductive biases in the DNN architecture\nand/or training algorithm with properties of the data distribution. We analyze\nthese, demonstrating that the denoiser performs a shrinkage operation in a\nbasis adapted to the underlying image. Examination of these bases reveals\noscillating harmonic structures along contours and in homogeneous image\nregions. We show that trained denoisers are inductively biased towards these\ngeometry-adaptive harmonic representations by demonstrating that they arise\neven when the network is trained on image classes such as low-dimensional\nmanifolds, for which the harmonic basis is suboptimal. Additionally, we show\nthat the denoising performance of the networks is near-optimal when trained on\nregular image classes for which the optimal basis is known to be\ngeometry-adaptive and harmonic.\n","authors":["Zahra Kadkhodaie","Florentin Guth","Eero P. Simoncelli","Stéphane Mallat"],"pdf_url":"https://arxiv.org/pdf/2310.02557v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02556v1","updated":"2023-10-04T03:30:24Z","published":"2023-10-04T03:30:24Z","title":"NOLA: Networks as Linear Combination of Low Rank Random Basis","summary":" Large Language Models (LLMs) have recently gained popularity due to their\nimpressive few-shot performance across various downstream tasks. 
However,\nfine-tuning all parameters and storing a unique model for each downstream task\nor domain becomes impractical because of the massive size of checkpoints (e.g.,\n350GB in GPT-3). Current literature, such as LoRA, showcases the potential of\nlow-rank modifications to the original weights of an LLM, enabling efficient\nadaptation and storage for task-specific models. These methods can reduce the\nnumber of parameters needed to fine-tune an LLM by several orders of magnitude.\nYet, these methods face two primary limitations: 1) the parameter reduction is\nlower-bounded by the rank one decomposition, and 2) the extent of reduction is\nheavily influenced by both the model architecture and the chosen rank. For\ninstance, in larger models, even a rank one decomposition might exceed the\nnumber of parameters truly needed for adaptation. In this paper, we introduce\nNOLA, which overcomes the rank one lower bound present in LoRA. It achieves\nthis by re-parameterizing the low-rank matrices in LoRA using linear\ncombinations of randomly generated matrices (basis) and optimizing the linear\nmixture coefficients only. This approach allows us to decouple the number of\ntrainable parameters from both the choice of rank and the network architecture.\nWe present adaptation results using GPT-2 and ViT in natural language and\ncomputer vision tasks. NOLA performs as well as, or better than models with\nequivalent parameter counts. Furthermore, we demonstrate that we can halve the\nparameters in larger models compared to LoRA with rank one, without sacrificing\nperformance.\n","authors":["Soroush Abbasi Koohpayegani","KL Navaneet","Parsa Nooralinejad","Soheil Kolouri","Hamed Pirsiavash"],"pdf_url":"https://arxiv.org/pdf/2310.02556v1.pdf","comment":"Our code is available here: https://github.com/UCDvision/NOLA"},{"id":"http://arxiv.org/abs/2303.06138v4","updated":"2023-10-04T03:17:03Z","published":"2023-03-10T18:55:46Z","title":"Learning Object-Centric Neural Scattering Functions for Free-Viewpoint\n Relighting and Scene Composition","summary":" Photorealistic object appearance modeling from 2D images is a constant topic\nin vision and graphics. While neural implicit methods (such as Neural Radiance\nFields) have shown high-fidelity view synthesis results, they cannot relight\nthe captured objects. More recent neural inverse rendering approaches have\nenabled object relighting, but they represent surface properties as simple\nBRDFs, and therefore cannot handle translucent objects. We propose\nObject-Centric Neural Scattering Functions (OSFs) for learning to reconstruct\nobject appearance from only images. OSFs not only support free-viewpoint object\nrelighting, but also can model both opaque and translucent objects. While\naccurately modeling subsurface light transport for translucent objects can be\nhighly complex and even intractable for neural methods, OSFs learn to\napproximate the radiance transfer from a distant light to an outgoing direction\nat any spatial location. 
This approximation avoids explicitly modeling complex\nsubsurface scattering, making learning a neural implicit model tractable.\nExperiments on real and synthetic data show that OSFs accurately reconstruct\nappearances for both opaque and translucent objects, allowing faithful\nfree-viewpoint relighting as well as scene composition.\n","authors":["Hong-Xing Yu","Michelle Guo","Alireza Fathi","Yen-Yu Chang","Eric Ryan Chan","Ruohan Gao","Thomas Funkhouser","Jiajun Wu"],"pdf_url":"https://arxiv.org/pdf/2303.06138v4.pdf","comment":"Journal extension of arXiv:2012.08503 (TMLR 2023). The first two\n authors contributed equally to this work. Project page:\n https://kovenyu.com/osf/"},{"id":"http://arxiv.org/abs/2310.02544v1","updated":"2023-10-04T02:57:01Z","published":"2023-10-04T02:57:01Z","title":"SlowFormer: Universal Adversarial Patch for Attack on Compute and Energy\n Efficiency of Inference Efficient Vision Transformers","summary":" Recently, there has been a lot of progress in reducing the computation of\ndeep models at inference time. These methods can reduce both the computational\nneeds and power usage of deep models. Some of these approaches adaptively scale\nthe compute based on the input instance. We show that such models can be\nvulnerable to a universal adversarial patch attack, where the attacker\noptimizes for a patch that when pasted on any image, can increase the compute\nand power consumption of the model. We run experiments with three different\nefficient vision transformer methods showing that in some cases, the attacker\ncan increase the computation to the maximum possible level by simply pasting a\npatch that occupies only 8\\% of the image area. We also show that a standard\nadversarial training defense method can reduce some of the attack's success. We\nbelieve adaptive efficient methods will be necessary for the future to lower\nthe power usage of deep models, so we hope our paper encourages the community\nto study the robustness of these methods and develop better defense methods for\nthe proposed attack.\n","authors":["KL Navaneet","Soroush Abbasi Koohpayegani","Essam Sleiman","Hamed Pirsiavash"],"pdf_url":"https://arxiv.org/pdf/2310.02544v1.pdf","comment":"Code is available at https://github.com/UCDvision/SlowFormer"},{"id":"http://arxiv.org/abs/2310.01886v2","updated":"2023-10-04T02:30:27Z","published":"2023-10-03T08:39:33Z","title":"Effective and Parameter-Efficient Reusing Fine-Tuned Models","summary":" Many pre-trained large-scale models provided online have become highly\neffective in transferring to downstream tasks. At the same time, various\ntask-specific models fine-tuned on these pre-trained models are available\nonline for public use. In practice, as collecting task-specific data is\nlabor-intensive and fine-tuning the large pre-trained models is computationally\nexpensive, one can reuse task-specific finetuned models to deal with downstream\ntasks. However, using a model per task causes a heavy burden on storage and\nserving. Recently, many training-free and parameter-efficient methods have been\nproposed for reusing multiple fine-tuned task-specific models into a single\nmulti-task model. However, these methods exhibit a large accuracy gap compared\nwith using a fine-tuned model per task. In this paper, we propose\nParameter-Efficient methods for ReUsing (PERU) fine-tuned models. For reusing\nFully Fine-Tuned (FFT) models, we propose PERU-FFT by injecting a sparse task\nvector into a merged model by magnitude pruning. 
For reusing LoRA fine-tuned\nmodels, we propose PERU-LoRA, which uses a lower-rank matrix to approximate the LoRA\nmatrix by singular value decomposition. Both PERU-FFT and PERU-LoRA are\ntraining-free. Extensive experiments conducted on computer vision and natural\nlanguage processing tasks demonstrate the effectiveness and parameter-efficiency\nof the proposed methods. The proposed PERU-FFT and PERU-LoRA outperform\nexisting reusing model methods by a large margin and achieve comparable\nperformance to using a fine-tuned model per task.\n","authors":["Weisen Jiang","Baijiong Lin","Han Shi","Yu Zhang","Zhenguo Li","James T. Kwok"],"pdf_url":"https://arxiv.org/pdf/2310.01886v2.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2310.02532v1","updated":"2023-10-04T02:17:59Z","published":"2023-10-04T02:17:59Z","title":"ShaSTA-Fuse: Camera-LiDAR Sensor Fusion to Model Shape and\n Spatio-Temporal Affinities for 3D Multi-Object Tracking","summary":" 3D multi-object tracking (MOT) is essential for an autonomous mobile agent to\nsafely navigate a scene. In order to maximize the perception capabilities of\nthe autonomous agent, we aim to develop a 3D MOT framework that fuses camera\nand LiDAR sensor information. Building on our prior LiDAR-only work, ShaSTA,\nwhich models shape and spatio-temporal affinities for 3D MOT, we propose a\nnovel camera-LiDAR fusion approach for learning affinities. At its core, this\nwork proposes a fusion technique that generates a rich sensory signal\nincorporating information about depth and distant objects to enhance affinity\nestimation for improved data association, track lifecycle management,\nfalse-positive elimination, false-negative propagation, and track confidence\nscore refinement. Our main contributions include a novel fusion approach for\ncombining camera and LiDAR sensory signals to learn affinities, and a\nfirst-of-its-kind multimodal sequential track confidence refinement technique\nthat fuses 2D and 3D detections. Additionally, we perform an ablative analysis\non each fusion step to demonstrate the added benefits of incorporating the\ncamera sensor, particularly for small, distant objects that tend to suffer from\nthe depth-sensing limits and sparsity of LiDAR sensors. In sum, our technique\nachieves state-of-the-art performance on the nuScenes benchmark amongst\nmultimodal 3D MOT algorithms using CenterPoint detections.\n","authors":["Tara Sadjadpour","Rares Ambrus","Jeannette Bohg"],"pdf_url":"https://arxiv.org/pdf/2310.02532v1.pdf","comment":"8 pages, 1 figure"},{"id":"http://arxiv.org/abs/2310.02528v1","updated":"2023-10-04T02:06:48Z","published":"2023-10-04T02:06:48Z","title":"On the Cognition of Visual Question Answering Models and Human\n Intelligence: A Comparative Study","summary":" Visual Question Answering (VQA) is a challenging task that requires\ncross-modal understanding and reasoning of visual image and natural language\nquestion. To inspect the association of VQA models to human cognition, we\ndesigned a survey to record the human thinking process and analyzed VQA models by\ncomparing the outputs and attention maps with those of humans. We found that\nalthough the VQA models resemble human cognition in architecture and perform\nsimilarly to humans at the recognition level, they still struggle with\ncognitive inferences. 
The analysis of human thinking procedure serves to direct\nfuture research and introduce more cognitive capacity into modeling features\nand architectures.\n","authors":["Liben Chen","Long Chen","Tian Ellison-Chen","Zhuoyuan Xu"],"pdf_url":"https://arxiv.org/pdf/2310.02528v1.pdf","comment":"16 pages, 11 figures"},{"id":"http://arxiv.org/abs/2307.01430v2","updated":"2023-10-04T01:56:32Z","published":"2023-07-04T01:47:34Z","title":"Continual Learning in Open-vocabulary Classification with Complementary\n Memory Systems","summary":" We introduce a method for flexible and efficient continual learning in\nopen-vocabulary image classification, drawing inspiration from the\ncomplementary learning systems observed in human cognition. Specifically, we\npropose to combine predictions from a CLIP zero-shot model and the\nexemplar-based model, using the zero-shot estimated probability that a sample's\nclass is within the exemplar classes. We also propose a \"tree probe\" method, an\nadaption of lazy learning principles, which enables fast learning from new\nexamples with competitive accuracy to batch-trained linear models. We test in\ndata incremental, class incremental, and task incremental settings, as well as\nability to perform flexible inference on varying subsets of zero-shot and\nlearned categories. Our proposed method achieves a good balance of learning\nspeed, target task effectiveness, and zero-shot effectiveness. Code will be\navailable at https://github.com/jessemelpolio/TreeProbe.\n","authors":["Zhen Zhu","Weijie Lyu","Yao Xiao","Derek Hoiem"],"pdf_url":"https://arxiv.org/pdf/2307.01430v2.pdf","comment":"In review"},{"id":"http://arxiv.org/abs/2310.02523v1","updated":"2023-10-04T01:47:36Z","published":"2023-10-04T01:47:36Z","title":"A Spatio-Temporal Attention-Based Method for Detecting Student Classroom\n Behaviors","summary":" Accurately detecting student behavior from classroom videos is beneficial for\nanalyzing their classroom status and improving teaching efficiency. However,\nlow accuracy in student classroom behavior detection is a prevalent issue. To\naddress this issue, we propose a Spatio-Temporal Attention-Based Method for\nDetecting Student Classroom Behaviors (BDSTA). Firstly, the SlowFast network is\nused to generate motion and environmental information feature maps from the\nvideo. Then, the spatio-temporal attention module is applied to the feature\nmaps, including information aggregation, compression and stimulation processes.\nSubsequently, attention maps in the time, channel and space dimensions are\nobtained, and multi-label behavior classification is performed based on these\nattention maps. To solve the long-tail data problem that exists in student\nclassroom behavior datasets, we use an improved focal loss function to assign\nmore weight to the tail class data during training. 
Experiments are\nconducted on a self-made student classroom behavior dataset named STSCB.\nCompared with the SlowFast model, the average accuracy of student behavior\nclassification detection improves by 8.94\\% using BDSTA.\n","authors":["Fan Yang"],"pdf_url":"https://arxiv.org/pdf/2310.02523v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02522v1","updated":"2023-10-04T01:43:46Z","published":"2023-10-04T01:43:46Z","title":"SCB-Dataset3: A Benchmark for Detecting Student Classroom Behavior","summary":" The use of deep learning methods to automatically detect students' classroom\nbehavior is a promising approach for analyzing their class performance and\nimproving teaching effectiveness. However, the lack of publicly available\ndatasets on student behavior poses a challenge for researchers in this field.\nTo address this issue, we propose the Student Classroom Behavior dataset\n(SCB-dataset3), which represents real-life scenarios. Our dataset comprises\n5686 images with 45578 labels, focusing on six behaviors: hand-raising,\nreading, writing, using a phone, bowing the head, and leaning over the table.\nWe evaluated the dataset using the YOLOv5, YOLOv7, and YOLOv8 algorithms,\nachieving a mean average precision (mAP) of up to 80.3$\\%$. We believe that our\ndataset can serve as a robust foundation for future research in student\nbehavior detection and contribute to advancements in this field. Our\nSCB-dataset3 is available for download at:\nhttps://github.com/Whiffe/SCB-dataset\n","authors":["Fan Yang","Tao Wang"],"pdf_url":"https://arxiv.org/pdf/2310.02522v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2304.02488,\n arXiv:2306.03318"},{"id":"http://arxiv.org/abs/2305.03048v2","updated":"2023-10-04T01:15:21Z","published":"2023-05-04T17:59:36Z","title":"Personalize Segment Anything Model with One Shot","summary":" Driven by large-data pre-training, Segment Anything Model (SAM) has been\ndemonstrated as a powerful and promptable framework, revolutionizing the\nsegmentation models. Despite the generality, customizing SAM for specific\nvisual concepts without man-powered prompting is underexplored, e.g.,\nautomatically segmenting your pet dog in different images. In this paper, we\npropose a training-free Personalization approach for SAM, termed as PerSAM.\nGiven only a single image with a reference mask, PerSAM first localizes the\ntarget concept by a location prior, and segments it within other images or\nvideos via three techniques: target-guided attention, target-semantic\nprompting, and cascaded post-refinement. In this way, we effectively adapt SAM\nfor private use without any training. To further alleviate the mask ambiguity,\nwe present an efficient one-shot fine-tuning variant, PerSAM-F. Freezing the\nentire SAM, we introduce two learnable weights for multi-scale masks, only\ntraining 2 parameters within 10 seconds for improved performance. To\ndemonstrate our efficacy, we construct a new segmentation dataset, PerSeg, for\npersonalized evaluation, and test our methods on video object segmentation with\ncompetitive performance. Besides, our approach can also enhance DreamBooth to\npersonalize Stable Diffusion for text-to-image generation, which discards the\nbackground disturbance for better target appearance learning. 
Code is released\nat https://github.com/ZrrSkywalker/Personalize-SAM\n","authors":["Renrui Zhang","Zhengkai Jiang","Ziyu Guo","Shilin Yan","Junting Pan","Xianzheng Ma","Hao Dong","Peng Gao","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2305.03048v2.pdf","comment":"Code is available at https://github.com/ZrrSkywalker/Personalize-SAM"},{"id":"http://arxiv.org/abs/2310.00199v2","updated":"2023-10-04T01:00:42Z","published":"2023-09-30T00:33:41Z","title":"DeformUX-Net: Exploring a 3D Foundation Backbone for Medical Image\n Segmentation with Depthwise Deformable Convolution","summary":" The application of 3D ViTs to medical image segmentation has seen remarkable\nstrides, somewhat overshadowing the budding advancements in Convolutional\nNeural Network (CNN)-based models. Large kernel depthwise convolution has\nemerged as a promising technique, showcasing capabilities akin to hierarchical\ntransformers and facilitating an expansive effective receptive field (ERF)\nvital for dense predictions. Despite this, existing core operators, ranging\nfrom global-local attention to large kernel convolution, exhibit inherent\ntrade-offs and limitations (e.g., global-local range trade-off, aggregating\nattentional features). We hypothesize that deformable convolution can be an\nexploratory alternative to combine all advantages from the previous operators,\nproviding long-range dependency, adaptive spatial aggregation and computational\nefficiency as a foundation backbone. In this work, we introduce 3D\nDeformUX-Net, a pioneering volumetric CNN model that adeptly navigates the\nshortcomings traditionally associated with ViTs and large kernel convolution.\nSpecifically, we revisit volumetric deformable convolution in depth-wise\nsetting to adapt long-range dependency with computational efficiency. Inspired\nby the concepts of structural re-parameterization for convolution kernel\nweights, we further generate the deformable tri-planar offsets by adapting a\nparallel branch (starting from $1\\times1\\times1$ convolution), providing\nadaptive spatial aggregation across all channels. Our empirical evaluations\nreveal that the 3D DeformUX-Net consistently outperforms existing\nstate-of-the-art ViTs and large kernel convolution models across four\nchallenging public datasets, spanning various scales from organs (KiTS: 0.680\nto 0.720, MSD Pancreas: 0.676 to 0.717, AMOS: 0.871 to 0.902) to vessels (e.g.,\nMSD hepatic vessels: 0.635 to 0.671) in mean Dice.\n","authors":["Ho Hin Lee","Quan Liu","Qi Yang","Xin Yu","Shunxing Bao","Yuankai Huo","Bennett A. Landman"],"pdf_url":"https://arxiv.org/pdf/2310.00199v2.pdf","comment":"14 pages, the source code with our pre-trained model is available at\n this https://github.com/MASILab/deform-uxnet"},{"id":"http://arxiv.org/abs/2309.13570v2","updated":"2023-10-04T00:51:32Z","published":"2023-09-24T07:06:45Z","title":"Towards Robust Mobile Digital-Twin Tracking via An RGBD-based\n Transformer Model and A Comprehensive Mobile Dataset","summary":" The potential of digital-twin technology, involving the creation of precise\ndigital replicas of physical objects, to reshape AR experiences in 3D object\ntracking and localization scenarios is significant. However, enabling robust 3D\nobject tracking in dynamic mobile AR environments remains a formidable\nchallenge. These scenarios often require a more robust pose estimator capable\nof handling the inherent sensor-level measurement noise. 
In this paper,\nrecognizing the challenges of comprehensive solutions in existing literature,\nwe propose a transformer-based 6DoF pose estimator designed to achieve\nstate-of-the-art accuracy under real-world noisy data. To systematically\nvalidate the new solution's performance against the prior art, we also\nintroduce a novel RGBD dataset called Digital Twin Tracking Dataset (DTTD) v2,\nwhich is focused on digital-twin object tracking scenarios. Expanded from an\nexisting DTTD v1, the new dataset adds digital-twin data captured using a\ncutting-edge mobile RGBD sensor suite on Apple iPhone 14 Pro, expanding the\napplicability of our approach to iPhone sensor data. Through extensive\nexperimentation and in-depth analysis, we illustrate the effectiveness of our\nmethods under significant depth data errors, surpassing the performance of\nexisting baselines. Code is made publicly available at:\nhttps://github.com/augcog/Robust-Digital-Twin-Tracking.\n","authors":["Zixun Huang","Keling Yao","Seth Z. Zhao","Chuanyu Pan","Tianjian Xu","Weiyu Feng","Allen Y. Yang"],"pdf_url":"https://arxiv.org/pdf/2309.13570v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.07136v3","updated":"2023-10-04T00:15:50Z","published":"2022-06-14T19:49:44Z","title":"Automatic Clipping: Differentially Private Deep Learning Made Easier and\n Stronger","summary":" Per-example gradient clipping is a key algorithmic step that enables\npractical differential private (DP) training for deep learning models. The\nchoice of clipping threshold R, however, is vital for achieving high accuracy\nunder DP. We propose an easy-to-use replacement, called automatic clipping,\nthat eliminates the need to tune R for any DP optimizers, including DP-SGD,\nDP-Adam, DP-LAMB and many others. The automatic variants are as private and\ncomputationally efficient as existing DP optimizers, but require no DP-specific\nhyperparameters and thus make DP training as amenable as the standard\nnon-private training. We give a rigorous convergence analysis of automatic\nDP-SGD in the non-convex setting, showing that it can enjoy an asymptotic\nconvergence rate that matches the standard SGD, under a symmetric gradient\nnoise assumption of the per-sample gradients (commonly used in the non-DP\nliterature). We demonstrate on various language and vision tasks that automatic\nclipping outperforms or matches the state-of-the-art, and can be easily\nemployed with minimal changes to existing codebases.\n","authors":["Zhiqi Bu","Yu-Xiang Wang","Sheng Zha","George Karypis"],"pdf_url":"https://arxiv.org/pdf/2206.07136v3.pdf","comment":"accepted to NeurIPS 2023"},{"id":"http://arxiv.org/abs/2310.03211v1","updated":"2023-10-04T23:33:36Z","published":"2023-10-04T23:33:36Z","title":"On the Performance of Multimodal Language Models","summary":" Instruction-tuned large language models (LLMs) have demonstrated promising\nzero-shot generalization capabilities across various downstream tasks. Recent\nresearch has introduced multimodal capabilities to LLMs by integrating\nindependently pretrained vision encoders through model grafting. These\nmultimodal variants undergo instruction tuning, similar to LLMs, enabling\neffective zero-shot generalization for multimodal tasks. This study conducts a\ncomparative analysis of different multimodal instruction tuning approaches and\nevaluates their performance across a range of tasks, including complex\nreasoning, conversation, image captioning, multiple-choice questions (MCQs),\nand binary classification. 
Through rigorous benchmarking and ablation\nexperiments, we reveal key insights for guiding architectural choices when\nincorporating multimodal capabilities into LLMs. However, current approaches\nhave limitations; they do not sufficiently address the need for a diverse\nmultimodal instruction dataset, which is crucial for enhancing task\ngeneralization. Additionally, they overlook issues related to truthfulness and\nfactuality when generating responses. These findings illuminate current\nmethodological constraints in adapting language models for image comprehension\nand provide valuable guidance for researchers and practitioners seeking to\nharness multimodal versions of LLMs.\n","authors":["Utsav Garg","Erhan Bas"],"pdf_url":"https://arxiv.org/pdf/2310.03211v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03205v1","updated":"2023-10-04T23:24:22Z","published":"2023-10-04T23:24:22Z","title":"A Large-Scale 3D Face Mesh Video Dataset via Neural Re-parameterized\n Optimization","summary":" We propose NeuFace, a 3D face mesh pseudo annotation method on videos via\nneural re-parameterized optimization. Despite the huge progress in 3D face\nreconstruction methods, generating reliable 3D face labels for in-the-wild\ndynamic videos remains challenging. Using NeuFace optimization, we annotate the\nper-view/-frame accurate and consistent face meshes on large-scale face videos,\ncalled the NeuFace-dataset. We investigate how neural re-parameterization helps\nto reconstruct image-aligned facial details on 3D meshes via gradient analysis.\nBy exploiting the naturalness and diversity of 3D faces in our dataset, we\ndemonstrate the usefulness of our dataset for 3D face-related tasks: improving\nthe reconstruction accuracy of an existing 3D face reconstruction model and\nlearning 3D facial motion prior. Code and datasets will be available at\nhttps://neuface-dataset.github.\n","authors":["Kim Youwang","Lee Hyun","Kim Sung-Bin","Suekyeong Nam","Janghoon Ju","Tae-Hyun Oh"],"pdf_url":"https://arxiv.org/pdf/2310.03205v1.pdf","comment":"9 pages, 7 figures, and 3 tables for the main paper. 8 pages, 6\n figures and 3 tables for the appendix"},{"id":"http://arxiv.org/abs/2307.11932v2","updated":"2023-10-04T22:57:04Z","published":"2023-07-21T22:39:41Z","title":"RIC: Rotate-Inpaint-Complete for Generalizable Scene Reconstruction","summary":" General scene reconstruction refers to the task of estimating the full 3D\ngeometry and texture of a scene containing previously unseen objects. In many\npractical applications such as AR/VR, autonomous navigation, and robotics, only\na single view of the scene may be available, making the scene reconstruction\ntask challenging. In this paper, we present a method for scene reconstruction\nby structurally breaking the problem into two steps: rendering novel views via\ninpainting and 2D to 3D scene lifting. Specifically, we leverage the\ngeneralization capability of large visual language models (Dalle-2) to inpaint\nthe missing areas of scene color images rendered from different views. Next, we\nlift these inpainted images to 3D by predicting normals of the inpainted image\nand solving for the missing depth values. By predicting for normals instead of\ndepth directly, our method allows for robustness to changes in depth\ndistributions and scale. 
With rigorous quantitative evaluation, we show that\nour method outperforms multiple baselines while providing generalization to\nnovel objects and scenes.\n","authors":["Isaac Kasahara","Shubham Agrawal","Selim Engin","Nikhil Chavan-Dafle","Shuran Song","Volkan Isler"],"pdf_url":"https://arxiv.org/pdf/2307.11932v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.13777v2","updated":"2023-10-04T22:40:47Z","published":"2023-09-24T23:16:38Z","title":"Diffeomorphic Multi-Resolution Deep Learning Registration for\n Applications in Breast MRI","summary":" In breast surgical planning, accurate registration of MR images across\npatient positions has the potential to improve the localisation of tumours\nduring breast cancer treatment. While learning-based registration methods have\nrecently become the state-of-the-art approach for most medical image\nregistration tasks, these methods have yet to make inroads into breast image\nregistration due to certain difficulties-the lack of rich texture information\nin breast MR images and the need for the deformations to be diffeomophic. In\nthis work, we propose learning strategies for breast MR image registration that\nare amenable to diffeomorphic constraints, together with early experimental\nresults from in-silico and in-vivo experiments. One key contribution of this\nwork is a registration network which produces superior registration outcomes\nfor breast images in addition to providing diffeomorphic guarantees.\n","authors":["Matthew G. French","Gonzalo D. Maso Talou","Thiranja P. Babarenda Gamage","Martyn P. Nash","Poul M. Nielsen","Anthony J. Doyle","Juan Eugenio Iglesias","Yaël Balbastre","Sean I. Young"],"pdf_url":"https://arxiv.org/pdf/2309.13777v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17455v2","updated":"2023-10-04T22:11:50Z","published":"2023-05-27T12:07:21Z","title":"CrossGET: Cross-Guided Ensemble of Tokens for Accelerating\n Vision-Language Transformers","summary":" Recent vision-language models have achieved tremendous progress far beyond\nwhat we ever expected. However, their computational costs are also dramatically\ngrowing with rapid development, especially for the large models. It makes model\nacceleration exceedingly critical in a scenario of limited resources. Although\nextensively studied for unimodal models, the acceleration for multimodal\nmodels, especially the vision-language Transformers, is relatively\nunder-explored. To pursue more efficient and accessible vision-language\nTransformers, this paper introduces \\textbf{Cross}-\\textbf{G}uided\n\\textbf{E}nsemble of \\textbf{T}okens (\\textbf{\\emph{CrossGET}}), a universal\nacceleration framework for vision-language Transformers. This framework\nadaptively combines tokens through real-time, cross-modal guidance, thereby\nachieving substantial acceleration while keeping high performance.\n\\textit{CrossGET} has two key innovations: 1) \\textit{Cross-Guided Matching and\nEnsemble}. \\textit{CrossGET} incorporates cross-modal guided token matching and\nensemble to exploit cross-modal information effectively, only introducing\ncross-modal tokens with negligible extra parameters. 2) \\textit{Complete-Graph\nSoft Matching}. In contrast to the existing bipartite soft matching approach,\n\\textit{CrossGET} introduces a complete-graph soft matching policy to achieve\nmore reliable token-matching results while maintaining parallelizability and\nhigh efficiency. 
Extensive experiments are conducted on various vision-language\ntasks, including image-text retrieval, visual reasoning, image captioning, and\nvisual question answering. Performance on both classic multimodal architectures\nand emerging multimodal LLMs demonstrate the effectiveness and versatility of\nthe proposed \\textit{CrossGET} framework. The code will be at\n\\url{https://github.com/sdc17/CrossGET}.\n","authors":["Dachuan Shi","Chaofan Tao","Anyi Rao","Zhendong Yang","Chun Yuan","Jiaqi Wang"],"pdf_url":"https://arxiv.org/pdf/2305.17455v2.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2110.15497v4","updated":"2023-10-04T22:05:42Z","published":"2021-10-29T02:32:44Z","title":"Unsupervised Foreground Extraction via Deep Region Competition","summary":" We present Deep Region Competition (DRC), an algorithm designed to extract\nforeground objects from images in a fully unsupervised manner. Foreground\nextraction can be viewed as a special case of generic image segmentation that\nfocuses on identifying and disentangling objects from the background. In this\nwork, we rethink the foreground extraction by reconciling energy-based prior\nwith generative image modeling in the form of Mixture of Experts (MoE), where\nwe further introduce the learned pixel re-assignment as the essential inductive\nbias to capture the regularities of background regions. With this modeling, the\nforeground-background partition can be naturally found through\nExpectation-Maximization (EM). We show that the proposed method effectively\nexploits the interaction between the mixture components during the partitioning\nprocess, which closely connects to region competition, a seminal approach for\ngeneric image segmentation. Experiments demonstrate that DRC exhibits more\ncompetitive performances on complex real-world data and challenging\nmulti-object scenes compared with prior methods. Moreover, we show empirically\nthat DRC can potentially generalize to novel foreground objects even from\ncategories unseen during training.\n","authors":["Peiyu Yu","Sirui Xie","Xiaojian Ma","Yixin Zhu","Ying Nian Wu","Song-Chun Zhu"],"pdf_url":"https://arxiv.org/pdf/2110.15497v4.pdf","comment":"NeurIPS 2021"},{"id":"http://arxiv.org/abs/2310.03182v1","updated":"2023-10-04T21:57:09Z","published":"2023-10-04T21:57:09Z","title":"Robust and Interpretable Medical Image Classifiers via Concept\n Bottleneck Models","summary":" Medical image classification is a critical problem for healthcare, with the\npotential to alleviate the workload of doctors and facilitate diagnoses of\npatients. However, two challenges arise when deploying deep learning models to\nreal-world healthcare applications. First, neural models tend to learn spurious\ncorrelations instead of desired features, which could fall short when\ngeneralizing to new domains (e.g., patients with different ages). Second, these\nblack-box models lack interpretability. When making diagnostic predictions, it\nis important to understand why a model makes a decision for trustworthy and\nsafety considerations. In this paper, to address these two limitations, we\npropose a new paradigm to build robust and interpretable medical image\nclassifiers with natural language concepts. Specifically, we first query\nclinical concepts from GPT-4, then transform latent image features into\nexplicit concepts with a vision-language model. We systematically evaluate our\nmethod on eight medical image classification datasets to verify its\neffectiveness. 
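The concept-bottleneck pipeline sketched in the medical-imaging abstract just above (clinical concepts obtained from a language model, images scored against those concepts with a vision-language encoder, and a simple classifier over the concept scores) can be illustrated with a minimal example. Everything below is a toy sketch with made-up embeddings, shapes, and names, not the authors' implementation:

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

def concept_scores(image_emb: np.ndarray, concept_emb: np.ndarray) -> np.ndarray:
    """Cosine similarity between each image embedding and each concept embedding."""
    img = image_emb / np.linalg.norm(image_emb, axis=1, keepdims=True)
    cpt = concept_emb / np.linalg.norm(concept_emb, axis=1, keepdims=True)
    return img @ cpt.T  # shape: (n_images, n_concepts)

# Illustrative shapes: 200 images, 512-d embeddings, 20 clinical concepts.
rng = np.random.default_rng(0)
image_emb = rng.normal(size=(200, 512))
concept_emb = rng.normal(size=(20, 512))
labels = rng.integers(0, 2, size=200)

X = concept_scores(image_emb, concept_emb)

# A linear classifier on concept scores keeps the decision interpretable:
# each learned weight indicates how strongly a clinical concept pushes the
# prediction toward one class.
clf = LogisticRegression(max_iter=1000).fit(X, labels)
print(clf.coef_.shape)  # (1, n_concepts): one weight per concept
```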
On challenging datasets with strong confounding factors, our\nmethod can mitigate spurious correlations thus substantially outperform\nstandard visual encoders and other baselines. Finally, we show how\nclassification with a small number of concepts brings a level of\ninterpretability for understanding model decisions through case studies in real\nmedical data.\n","authors":["An Yan","Yu Wang","Yiwu Zhong","Zexue He","Petros Karypis","Zihan Wang","Chengyu Dong","Amilcare Gentili","Chun-Nan Hsu","Jingbo Shang","Julian McAuley"],"pdf_url":"https://arxiv.org/pdf/2310.03182v1.pdf","comment":"18 pages, 12 figures"},{"id":"http://arxiv.org/abs/2308.16150v2","updated":"2023-10-04T21:20:18Z","published":"2023-08-30T17:16:02Z","title":"Modality Cycles with Masked Conditional Diffusion for Unsupervised\n Anomaly Segmentation in MRI","summary":" Unsupervised anomaly segmentation aims to detect patterns that are distinct\nfrom any patterns processed during training, commonly called abnormal or\nout-of-distribution patterns, without providing any associated manual\nsegmentations. Since anomalies during deployment can lead to model failure,\ndetecting the anomaly can enhance the reliability of models, which is valuable\nin high-risk domains like medical imaging. This paper introduces Masked\nModality Cycles with Conditional Diffusion (MMCCD), a method that enables\nsegmentation of anomalies across diverse patterns in multimodal MRI. The method\nis based on two fundamental ideas. First, we propose the use of cyclic modality\ntranslation as a mechanism for enabling abnormality detection.\nImage-translation models learn tissue-specific modality mappings, which are\ncharacteristic of tissue physiology. Thus, these learned mappings fail to\ntranslate tissues or image patterns that have never been encountered during\ntraining, and the error enables their segmentation. Furthermore, we combine\nimage translation with a masked conditional diffusion model, which attempts to\n`imagine' what tissue exists under a masked area, further exposing unknown\npatterns as the generative model fails to recreate them. We evaluate our method\non a proxy task by training on healthy-looking slices of BraTS2021\nmulti-modality MRIs and testing on slices with tumors. We show that our method\ncompares favorably to previous unsupervised approaches based on image\nreconstruction and denoising with autoencoders and diffusion models.\n","authors":["Ziyun Liang","Harry Anthony","Felix Wagner","Konstantinos Kamnitsas"],"pdf_url":"https://arxiv.org/pdf/2308.16150v2.pdf","comment":"Accepted in Multiscale Multimodal Medical Imaging workshop in MICCAI\n 2023"},{"id":"http://arxiv.org/abs/2310.01662v2","updated":"2023-10-04T20:39:08Z","published":"2023-10-02T21:52:47Z","title":"SYRAC: Synthesize, Rank, and Count","summary":" Crowd counting is a critical task in computer vision, with several important\napplications. However, existing counting methods rely on labor-intensive\ndensity map annotations, necessitating the manual localization of each\nindividual pedestrian. While recent efforts have attempted to alleviate the\nannotation burden through weakly or semi-supervised learning, these approaches\nfall short of significantly reducing the workload. We propose a novel approach\nto eliminate the annotation burden by leveraging latent diffusion models to\ngenerate synthetic data. 
However, these models struggle to reliably understand\nobject quantities, leading to noisy annotations when prompted to produce images\nwith a specific quantity of objects. To address this, we use latent diffusion\nmodels to create two types of synthetic data: one by removing pedestrians from\nreal images, which generates ranked image pairs with a weak but reliable object\nquantity signal, and the other by generating synthetic images with a\npredetermined number of objects, offering a strong but noisy counting signal.\nOur method utilizes the ranking image pairs for pre-training and then fits a\nlinear layer to the noisy synthetic images using these crowd quantity features.\nWe report state-of-the-art results for unsupervised crowd counting.\n","authors":["Adriano D'Alessandro","Ali Mahdavi-Amiri","Ghassan Hamarneh"],"pdf_url":"https://arxiv.org/pdf/2310.01662v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03149v1","updated":"2023-10-04T20:26:59Z","published":"2023-10-04T20:26:59Z","title":"Attributing Learned Concepts in Neural Networks to Training Data","summary":" By now there is substantial evidence that deep learning models learn certain\nhuman-interpretable features as part of their internal representations of data.\nAs having the right (or wrong) concepts is critical to trustworthy machine\nlearning systems, it is natural to ask which inputs from the model's original\ntraining set were most important for learning a concept at a given layer. To\nanswer this, we combine data attribution methods with methods for probing the\nconcepts learned by a model. Training network and probe ensembles for two\nconcept datasets on a range of network layers, we use the recently developed\nTRAK method for large-scale data attribution. We find some evidence for\nconvergence, where removing the 10,000 top attributing images for a concept and\nretraining the model does not change the location of the concept in the network\nnor the probing sparsity of the concept. This suggests that rather than being\nhighly dependent on a few specific examples, the features that inform the\ndevelopment of a concept are spread in a more diffuse manner across its\nexemplars, implying robustness in concept formation.\n","authors":["Nicholas Konz","Charles Godfrey","Madelyn Shapiro","Jonathan Tu","Henry Kvinge","Davis Brown"],"pdf_url":"https://arxiv.org/pdf/2310.03149v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03140v1","updated":"2023-10-04T20:05:40Z","published":"2023-10-04T20:05:40Z","title":"ViFiT: Reconstructing Vision Trajectories from IMU and Wi-Fi Fine Time\n Measurements","summary":" Tracking subjects in videos is one of the most widely used functions in\ncamera-based IoT applications such as security surveillance, smart city traffic\nsafety enhancement, vehicle to pedestrian communication and so on. In the\ncomputer vision domain, tracking is usually achieved by first detecting\nsubjects with bounding boxes, then associating detected bounding boxes across\nvideo frames. For many IoT systems, images captured by cameras are usually sent\nover the network to be processed at a different site that has more powerful\ncomputing resources than edge devices. However, sending entire frames through\nthe network causes significant bandwidth consumption that may exceed the system\nbandwidth constraints. To tackle this problem, we propose ViFiT, a\ntransformer-based model that reconstructs vision bounding box trajectories from\nphone data (IMU and Fine Time Measurements). 
It leverages a transformer ability\nof better modeling long-term time series data. ViFiT is evaluated on Vi-Fi\nDataset, a large-scale multimodal dataset in 5 diverse real world scenes,\nincluding indoor and outdoor environments. To fill the gap of proper metrics of\njointly capturing the system characteristics of both tracking quality and video\nbandwidth reduction, we propose a novel evaluation framework dubbed Minimum\nRequired Frames (MRF) and Minimum Required Frames Ratio (MRFR). ViFiT achieves\nan MRFR of 0.65 that outperforms the state-of-the-art approach for cross-modal\nreconstruction in LSTM Encoder-Decoder architecture X-Translator of 0.98,\nresulting in a high frame reduction rate as 97.76%.\n","authors":["Bryan Bo Cao","Abrar Alali","Hansi Liu","Nicholas Meegan","Marco Gruteser","Kristin Dana","Ashwin Ashok","Shubham Jain"],"pdf_url":"https://arxiv.org/pdf/2310.03140v1.pdf","comment":"22 pages, 12 figures, 9 tables. MobiCom 2023 ISACom"},{"id":"http://arxiv.org/abs/2310.03125v1","updated":"2023-10-04T19:35:56Z","published":"2023-10-04T19:35:56Z","title":"Shielding the Unseen: Privacy Protection through Poisoning NeRF with\n Spatial Deformation","summary":" In this paper, we introduce an innovative method of safeguarding user privacy\nagainst the generative capabilities of Neural Radiance Fields (NeRF) models.\nOur novel poisoning attack method induces changes to observed views that are\nimperceptible to the human eye, yet potent enough to disrupt NeRF's ability to\naccurately reconstruct a 3D scene. To achieve this, we devise a bi-level\noptimization algorithm incorporating a Projected Gradient Descent (PGD)-based\nspatial deformation. We extensively test our approach on two common NeRF\nbenchmark datasets consisting of 29 real-world scenes with high-quality images.\nOur results compellingly demonstrate that our privacy-preserving method\nsignificantly impairs NeRF's performance across these benchmark datasets.\nAdditionally, we show that our method is adaptable and versatile, functioning\nacross various perturbation strengths and NeRF architectures. This work offers\nvaluable insights into NeRF's vulnerabilities and emphasizes the need to\naccount for such potential privacy risks when developing robust 3D scene\nreconstruction algorithms. Our study contributes to the larger conversation\nsurrounding responsible AI and generative machine learning, aiming to protect\nuser privacy and respect creative ownership in the digital age.\n","authors":["Yihan Wu","Brandon Y. Feng","Heng Huang"],"pdf_url":"https://arxiv.org/pdf/2310.03125v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03118v1","updated":"2023-10-04T19:13:16Z","published":"2023-10-04T19:13:16Z","title":"Blind CT Image Quality Assessment Using DDPM-derived Content and\n Transformer-based Evaluator","summary":" Lowering radiation dose per view and utilizing sparse views per scan are two\ncommon CT scan modes, albeit often leading to distorted images characterized by\nnoise and streak artifacts. Blind image quality assessment (BIQA) strives to\nevaluate perceptual quality in alignment with what radiologists perceive, which\nplays an important role in advancing low-dose CT reconstruction techniques. An\nintriguing direction involves developing BIQA methods that mimic the\noperational characteristic of the human visual system (HVS). The internal\ngenerative mechanism (IGM) theory reveals that the HVS actively deduces primary\ncontent to enhance comprehension. 
In this study, we introduce an innovative\nBIQA metric that emulates the active inference process of IGM. Initially, an\nactive inference module, implemented as a denoising diffusion probabilistic\nmodel (DDPM), is constructed to anticipate the primary content. Then, the\ndissimilarity map is derived by assessing the interrelation between the\ndistorted image and its primary content. Subsequently, the distorted image and\ndissimilarity map are combined into a multi-channel image, which is inputted\ninto a transformer-based image quality evaluator. Remarkably, by exclusively\nutilizing this transformer-based quality evaluator, we won the second place in\nthe MICCAI 2023 low-dose computed tomography perceptual image quality\nassessment grand challenge. Leveraging the DDPM-derived primary content, our\napproach further improves the performance on the challenge dataset.\n","authors":["Yongyi Shi","Wenjun Xia","Ge Wang","Xuanqin Mou"],"pdf_url":"https://arxiv.org/pdf/2310.03118v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2310.03108v1","updated":"2023-10-04T18:58:47Z","published":"2023-10-04T18:58:47Z","title":"Reinforcement Learning-based Mixture of Vision Transformers for Video\n Violence Recognition","summary":" Video violence recognition based on deep learning concerns accurate yet\nscalable human violence recognition. Currently, most state-of-the-art video\nviolence recognition studies use CNN-based models to represent and categorize\nvideos. However, recent studies suggest that pre-trained transformers are more\naccurate than CNN-based models on various video analysis benchmarks. Yet these\nmodels are not thoroughly evaluated for video violence recognition. This paper\nintroduces a novel transformer-based Mixture of Experts (MoE) video violence\nrecognition system. Through an intelligent combination of large vision\ntransformers and efficient transformer architectures, the proposed system not\nonly takes advantage of the vision transformer architecture but also reduces\nthe cost of utilizing large vision transformers. The proposed architecture\nmaximizes violence recognition system accuracy while actively reducing\ncomputational costs through a reinforcement learning-based router. The\nempirical results show the proposed MoE architecture's superiority over\nCNN-based models by achieving 92.4% accuracy on the RWF dataset.\n","authors":["Hamid Mohammadi","Ehsan Nazerfard","Tahereh Firoozi"],"pdf_url":"https://arxiv.org/pdf/2310.03108v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2102.00696v2","updated":"2023-10-04T18:56:52Z","published":"2021-02-01T08:30:42Z","title":"Numerical Weather Forecasting using Convolutional-LSTM with Attention\n and Context Matcher Mechanisms","summary":" Numerical weather forecasting using high-resolution physical models often\nrequires extensive computational resources on supercomputers, which diminishes\ntheir wide usage in most real-life applications. As a remedy, applying deep\nlearning methods has revealed innovative solutions within this field. To this\nend, we introduce a novel deep learning architecture for forecasting\nhigh-resolution spatio-temporal weather data. Our approach extends the\nconventional encoder-decoder structure by integrating Convolutional Long-short\nTerm Memory and Convolutional Neural Networks. In addition, we incorporate\nattention and context matcher mechanisms into the model architecture. 
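The weather-forecasting abstract above builds its encoder-decoder around convolutional LSTM layers. As a point of reference, a generic ConvLSTM cell (not the authors' architecture; channel counts and kernel size here are arbitrary) can be written in PyTorch as:

```python
import torch
import torch.nn as nn

class ConvLSTMCell(nn.Module):
    """A single ConvLSTM cell: LSTM gates computed with 2D convolutions."""

    def __init__(self, in_ch: int, hid_ch: int, kernel: int = 3):
        super().__init__()
        # One convolution produces all four gates (input, forget, output, cell).
        self.gates = nn.Conv2d(in_ch + hid_ch, 4 * hid_ch, kernel, padding=kernel // 2)
        self.hid_ch = hid_ch

    def forward(self, x, state):
        h, c = state
        i, f, o, g = torch.chunk(self.gates(torch.cat([x, h], dim=1)), 4, dim=1)
        i, f, o = torch.sigmoid(i), torch.sigmoid(f), torch.sigmoid(o)
        c = f * c + i * torch.tanh(g)
        h = o * torch.tanh(c)
        return h, c

# Roll the cell over a toy sequence of 2D weather fields.
B, T, C, H, W = 2, 5, 1, 32, 32
cell = ConvLSTMCell(in_ch=C, hid_ch=8)
h = torch.zeros(B, 8, H, W)
c = torch.zeros(B, 8, H, W)
seq = torch.randn(B, T, C, H, W)
for t in range(T):
    h, c = cell(seq[:, t], (h, c))
print(h.shape)  # torch.Size([2, 8, 32, 32])
```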
Our\nWeather Model achieves significant performance improvements compared to\nbaseline deep learning models, including ConvLSTM, TrajGRU, and U-Net. Our\nexperimental evaluation involves high-scale, real-world benchmark numerical\nweather datasets, namely the ERA5 hourly dataset on pressure levels and\nWeatherBench. Our results demonstrate substantial improvements in identifying\nspatial and temporal correlations with attention matrices focusing on distinct\nparts of the input series to model atmospheric circulations. We also compare\nour model with high-resolution physical models using the benchmark metrics and\nshow that our Weather Model is accurate and easy to interpret.\n","authors":["Selim Furkan Tekin","Arda Fazla","Suleyman Serdar Kozat"],"pdf_url":"https://arxiv.org/pdf/2102.00696v2.pdf","comment":"- In our journal submission, we removed the integration of the\n observational data section since it was not used in the experiments. Thus, we\n also removed the authors from the paper who were responsible for that\n section. - In the second version, we also performed an experiment on\n WeatherBench. We compare our results with the Physical Weather Forecasting\n Models"},{"id":"http://arxiv.org/abs/2310.03106v1","updated":"2023-10-04T18:51:25Z","published":"2023-10-04T18:51:25Z","title":"Creating an Atlas of Normal Tissue for Pruning WSI Patching Through\n Anomaly Detection","summary":" Patching gigapixel whole slide images (WSIs) is an important task in\ncomputational pathology. Some methods have been proposed to select a subset of\npatches as WSI representation for downstream tasks. While most of the\ncomputational pathology tasks are designed to classify or detect the presence\nof pathological lesions in each WSI, the confounding role and redundant nature\nof normal histology in tissue samples are generally overlooked in WSI\nrepresentations. In this paper, we propose and validate the concept of an\n\"atlas of normal tissue\" solely using samples of WSIs obtained from normal\ntissue biopsies. Such atlases can be employed to eliminate normal fragments of\ntissue samples and hence increase the representativeness collection of patches.\nWe tested our proposed method by establishing a normal atlas using 107 normal\nskin WSIs and demonstrated how established indexes and search engines like\nYottixel can be improved. We used 553 WSIs of cutaneous squamous cell carcinoma\n(cSCC) to show the advantage. We also validated our method applied to an\nexternal dataset of 451 breast WSIs. The number of selected WSI patches was\nreduced by 30% to 50% after utilizing the proposed normal atlas while\nmaintaining the same indexing and search performance in leave-one-patinet-out\nvalidation for both datasets. We show that the proposed normal atlas shows\npromise for unsupervised selection of the most representative patches of the\nabnormal/malignant WSI lesions.\n","authors":["Peyman Nejat","Areej Alsaafin","Ghazal Alabtah","Nneka Comfere","Aaron Mangold","Dennis Murphree","Patricija Zot","Saba Yasir","Joaquin J. Garcia","H. R. Tizhoosh"],"pdf_url":"https://arxiv.org/pdf/2310.03106v1.pdf","comment":"13 pages, 9 figures, 3 tables"},{"id":"http://arxiv.org/abs/2310.03091v1","updated":"2023-10-04T18:18:24Z","published":"2023-10-04T18:18:24Z","title":"Privacy-preserving Multi-biometric Indexing based on Frequent Binary\n Patterns","summary":" The development of large-scale identification systems that ensure the privacy\nprotection of enrolled subjects represents a major challenge. 
Biometric\ndeployments that provide interoperability and usability by including efficient\nmulti-biometric solutions are a recent requirement. In the context of privacy\nprotection, several template protection schemes have been proposed in the past.\nHowever, these schemes seem inadequate for indexing (workload reduction) in\nbiometric identification systems. More specifically, they have been used in\nidentification systems that perform exhaustive searches, leading to a\ndegradation of computational efficiency. To overcome these limitations, we\npropose an efficient privacy-preserving multi-biometric identification system\nthat retrieves protected deep cancelable templates and is agnostic with respect\nto biometric characteristics and biometric template protection schemes. To this\nend, a multi-biometric binning scheme is designed to exploit the low\nintra-class variation properties contained in the frequent binary patterns\nextracted from different types of biometric characteristics. Experimental\nresults reported on publicly available databases using state-of-the-art Deep\nNeural Network (DNN)-based embedding extractors show that the protected\nmulti-biometric identification system can reduce the computational workload to\napproximately 57\\% (indexing up to three types of biometric characteristics)\nand 53% (indexing up to two types of biometric characteristics), while\nsimultaneously improving the biometric performance of the baseline biometric\nsystem at the high-security thresholds. The source code of the proposed\nmulti-biometric indexing approach together with the composed multi-biometric\ndataset, will be made available to the research community once the article is\naccepted.\n","authors":["Daile Osorio-Roig","Lazaro J. Gonzalez-Soler","Christian Rathgeb","Christoph Busch"],"pdf_url":"https://arxiv.org/pdf/2310.03091v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03059v1","updated":"2023-10-04T16:49:36Z","published":"2023-10-04T16:49:36Z","title":"Point-PEFT: Parameter-Efficient Fine-Tuning for 3D Pre-trained Models","summary":" The popularity of pre-trained large models has revolutionized downstream\ntasks across diverse fields, such as language, vision, and multi-modality. To\nminimize the adaption cost for downstream tasks, many Parameter-Efficient\nFine-Tuning (PEFT) techniques are proposed for language and 2D image\npre-trained models. However, the specialized PEFT method for 3D pre-trained\nmodels is still under-explored. To this end, we introduce Point-PEFT, a novel\nframework for adapting point cloud pre-trained models with minimal learnable\nparameters. Specifically, for a pre-trained 3D model, we freeze most of its\nparameters, and only tune the newly added PEFT modules on downstream tasks,\nwhich consist of a Point-prior Prompt and a Geometry-aware Adapter. The\nPoint-prior Prompt adopts a set of learnable prompt tokens, for which we\npropose to construct a memory bank with domain-specific knowledge, and utilize\na parameter-free attention to enhance the prompt tokens. The Geometry-aware\nAdapter aims to aggregate point cloud features within spatial neighborhoods to\ncapture fine-grained geometric information through local interactions.\nExtensive experiments indicate that our Point-PEFT can achieve better\nperformance than the full fine-tuning on various downstream tasks, while using\nonly 5% of the trainable parameters, demonstrating the efficiency and\neffectiveness of our approach. 
Code will be released at\nhttps://github.com/EvenJoker/Point-PEFT.\n","authors":["Ivan Tang","Eric Zhang","Ray Gu"],"pdf_url":"https://arxiv.org/pdf/2310.03059v1.pdf","comment":"10 pages. The specialized PEFT framework for 3D pre-trained models,\n which achieves competitive performance to full fine-tuning, and significantly\n reduces the computational resources. Project page:\n https://github.com/EvenJoker/Point-PEFT"},{"id":"http://arxiv.org/abs/2310.00031v2","updated":"2023-10-04T14:57:09Z","published":"2023-09-29T05:16:41Z","title":"Text-image Alignment for Diffusion-based Perception","summary":" Diffusion models are generative models with impressive text-to-image\nsynthesis capabilities and have spurred a new wave of creative methods for\nclassical machine learning tasks. However, the best way to harness the\nperceptual knowledge of these generative models for visual tasks is still an\nopen question. Specifically, it is unclear how to use the prompting interface\nwhen applying diffusion backbones to vision tasks. We find that automatically\ngenerated captions can improve text-image alignment and significantly enhance a\nmodel's cross-attention maps, leading to better perceptual performance. Our\napproach improves upon the current SOTA in diffusion-based semantic\nsegmentation on ADE20K and the current overall SOTA in depth estimation on\nNYUv2. Furthermore, our method generalizes to the cross-domain setting; we use\nmodel personalization and caption modifications to align our model to the\ntarget domain and find improvements over unaligned baselines. Our object\ndetection model, trained on Pascal VOC, achieves SOTA results on Watercolor2K.\nOur segmentation method, trained on Cityscapes, achieves SOTA results on Dark\nZurich-val and Nighttime Driving. Project page:\nhttps://www.vision.caltech.edu/tadp/\n","authors":["Neehar Kondapaneni","Markus Marks","Manuel Knott","Rogério Guimarães","Pietro Perona"],"pdf_url":"https://arxiv.org/pdf/2310.00031v2.pdf","comment":"Project page: https://www.vision.caltech.edu/tadp/"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2310.03025v1","updated":"2023-10-04T17:59:41Z","published":"2023-10-04T17:59:41Z","title":"Retrieval meets Long Context Large Language Models","summary":" Extending the context window of large language models (LLMs) is getting\npopular recently, while the solution of augmenting LLMs with retrieval has\nexisted for years. The natural questions are: i) Retrieval-augmentation versus\nlong context window, which one is better for downstream tasks? ii) Can both\nmethods be combined to get the best of both worlds? In this work, we answer\nthese questions by studying both solutions using two state-of-the-art\npretrained LLMs, i.e., a proprietary 43B GPT and LLaMA2-70B. Perhaps\nsurprisingly, we find that LLM with 4K context window using simple\nretrieval-augmentation at generation can achieve comparable performance to\nfinetuned LLM with 16K context window via positional interpolation on long\ncontext tasks, while taking much less computation. More importantly, we\ndemonstrate that retrieval can significantly improve the performance of LLMs\nregardless of their extended context window sizes. Our best model,\nretrieval-augmented LLaMA2-70B with 32K context window, outperforms\nGPT-3.5-turbo-16k and Davinci003 in terms of average score on seven long\ncontext tasks including question answering and query-based summarization. 
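The retrieval-augmentation setup described just above boils down to retrieving the passages most relevant to a query and prepending them to the prompt before generation. A minimal sketch with TF-IDF similarity standing in for a learned retriever follows; the corpus, query, and prompt template are placeholders, not anything from the paper:

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

corpus = [
    "Positional interpolation extends the context window of a language model.",
    "Retrieval augmentation prepends relevant passages to the prompt.",
    "Query-based summarization condenses documents with respect to a question.",
]

def build_prompt(query: str, top_k: int = 2) -> str:
    """Retrieve the top_k most similar passages and prepend them to the query."""
    vec = TfidfVectorizer().fit(corpus)
    scores = cosine_similarity(vec.transform([query]), vec.transform(corpus))[0]
    best = scores.argsort()[::-1][:top_k]
    context = "\n".join(corpus[i] for i in best)
    return f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"

print(build_prompt("How does retrieval augmentation help long-context tasks?"))
```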
It\nalso outperforms its non-retrieval LLaMA2-70B-32k baseline by a margin, while\nbeing much faster at generation. Our study provides general insights on the\nchoice of retrieval-augmentation versus long context extension of LLM for\npractitioners.\n","authors":["Peng Xu","Wei Ping","Xianchao Wu","Lawrence McAfee","Chen Zhu","Zihan Liu","Sandeep Subramanian","Evelina Bakhturina","Mohammad Shoeybi","Bryan Catanzaro"],"pdf_url":"https://arxiv.org/pdf/2310.03025v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02961v1","updated":"2023-10-04T16:54:03Z","published":"2023-10-04T16:54:03Z","title":"Potential Factors Leading to Popularity Unfairness in Recommender\n Systems: A User-Centered Analysis","summary":" Popularity bias is a well-known issue in recommender systems where few\npopular items are over-represented in the input data, while majority of other\nless popular items are under-represented. This disparate representation often\nleads to bias in exposure given to the items in the recommendation results.\nExtensive research examined this bias from item perspective and attempted to\nmitigate it by enhancing the recommendation of less popular items. However, a\nrecent research has revealed the impact of this bias on users. Users with\ndifferent degree of tolerance toward popular items are not fairly served by the\nrecommendation system: users interested in less popular items receive more\npopular items in their recommendations, while users interested in popular items\nare recommended what they want. This is mainly due to the popularity bias that\npopular items are over-recommended. In this paper, we aim at investigating the\nfactors leading to this user-side unfairness of popularity bias in recommender\nsystems. In particular, we investigate two factors: 1) the relationship between\nthis unfairness and users' interest toward items' categories (e.g., movie\ngenres), 2) the relationship between this unfairness and the diversity of the\npopularity group in users' profile (the degree to which the user is interested\nin items with different degree of popularity). Experiments on a movie\nrecommendation dataset using multiple recommendation algorithms show that these\ntwo factors are significantly correlated with the degree of popularity\nunfairness in the recommendation results.\n","authors":["Masoud Mansoury","Finn Duijvestijn","Imane Mourabet"],"pdf_url":"https://arxiv.org/pdf/2310.02961v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11916v2","updated":"2023-10-04T15:54:30Z","published":"2023-03-21T15:06:35Z","title":"CompoDiff: Versatile Composed Image Retrieval With Latent Diffusion","summary":" This paper proposes a novel diffusion-based model, CompoDiff, for solving\nComposed Image Retrieval (CIR) with latent diffusion and presents a newly\ncreated dataset, named SynthTriplets18M, of 18 million reference images,\nconditions, and corresponding target image triplets to train the model.\nCompoDiff and SynthTriplets18M tackle the shortages of the previous CIR\napproaches, such as poor generalizability due to the small dataset scale and\nthe limited types of conditions. 
CompoDiff not only achieves a new zero-shot\nstate-of-the-art on four CIR benchmarks, including FashionIQ, CIRR, CIRCO, and\nGeneCIS, but also enables a more versatile and controllable CIR by accepting\nvarious conditions, such as negative text and image mask conditions, and the\ncontrollability to the importance between multiple queries or the trade-off\nbetween inference speed and the performance which are unavailable with existing\nCIR methods. The code and dataset are available at\nhttps://github.com/navervision/CompoDiff\n","authors":["Geonmo Gu","Sanghyuk Chun","Wonjae Kim","HeeJae Jun","Yoohoon Kang","Sangdoo Yun"],"pdf_url":"https://arxiv.org/pdf/2303.11916v2.pdf","comment":"First two authors contributed equally; 26 pages, 4.1MB"},{"id":"http://arxiv.org/abs/2108.05641v2","updated":"2023-10-04T03:21:08Z","published":"2021-08-12T10:12:48Z","title":"SR-HetGNN:Session-based Recommendation with Heterogeneous Graph Neural\n Network","summary":" The purpose of the Session-Based Recommendation System is to predict the\nuser's next click according to the previous session sequence. The current\nstudies generally learn user preferences according to the transitions of items\nin the user's session sequence. However, other effective information in the\nsession sequence, such as user profiles, are largely ignored which may lead to\nthe model unable to learn the user's specific preferences. In this paper, we\npropose a heterogeneous graph neural network-based session recommendation\nmethod, named SR-HetGNN, which can learn session embeddings by heterogeneous\ngraph neural network (HetGNN), and capture the specific preferences of\nanonymous users. Specifically, SR-HetGNN first constructs heterogeneous graphs\ncontaining various types of nodes according to the session sequence, which can\ncapture the dependencies among items, users, and sessions. Second, HetGNN\ncaptures the complex transitions between items and learns the item embeddings\ncontaining user information. Finally, to consider the influence of users' long\nand short-term preferences, local and global session embeddings are combined\nwith the attentional network to obtain the final session embedding. SR-HetGNN\nis shown to be superior to the existing state-of-the-art session-based\nrecommendation methods through extensive experiments over two real large\ndatasets Diginetica and Tmall.\n","authors":["Jinpeng Chen","Haiyang Li","Xudong Zhang","Fan Zhang","Senzhang Wang","Kaimin Wei","Jiaqi Ji"],"pdf_url":"https://arxiv.org/pdf/2108.05641v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02540v1","updated":"2023-10-04T02:46:44Z","published":"2023-10-04T02:46:44Z","title":"Auto-FP: An Experimental Study of Automated Feature Preprocessing for\n Tabular Data","summary":" Classical machine learning models, such as linear models and tree-based\nmodels, are widely used in industry. These models are sensitive to data\ndistribution, thus feature preprocessing, which transforms features from one\ndistribution to another, is a crucial step to ensure good model quality.\nManually constructing a feature preprocessing pipeline is challenging because\ndata scientists need to make difficult decisions about which preprocessors to\nselect and in which order to compose them. In this paper, we study how to\nautomate feature preprocessing (Auto-FP) for tabular data. Due to the large\nsearch space, a brute-force solution is prohibitively expensive. 
To address\nthis challenge, we interestingly observe that Auto-FP can be modelled as either\na hyperparameter optimization (HPO) or a neural architecture search (NAS)\nproblem. This observation enables us to extend a variety of HPO and NAS\nalgorithms to solve the Auto-FP problem. We conduct a comprehensive evaluation\nand analysis of 15 algorithms on 45 public ML datasets. Overall,\nevolution-based algorithms show the leading average ranking. Surprisingly, the\nrandom search turns out to be a strong baseline. Many surrogate-model-based and\nbandit-based search algorithms, which achieve good performance for HPO and NAS,\ndo not outperform random search for Auto-FP. We analyze the reasons for our\nfindings and conduct a bottleneck analysis to identify the opportunities to\nimprove these algorithms. Furthermore, we explore how to extend Auto-FP to\nsupport parameter search and compare two ways to achieve this goal. In the end,\nwe evaluate Auto-FP in an AutoML context and discuss the limitations of popular\nAutoML tools. To the best of our knowledge, this is the first study on\nautomated feature preprocessing. We hope our work can inspire researchers to\ndevelop new algorithms tailored for Auto-FP.\n","authors":["Danrui Qi","Jinglin Peng","Yongjun He","Jiannan Wang"],"pdf_url":"https://arxiv.org/pdf/2310.02540v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00589v2","updated":"2023-10-04T01:43:15Z","published":"2023-07-02T15:11:59Z","title":"MedCPT: Contrastive Pre-trained Transformers with Large-scale PubMed\n Search Logs for Zero-shot Biomedical Information Retrieval","summary":" Information retrieval (IR) is essential in biomedical knowledge acquisition\nand clinical decision support. While recent progress has shown that language\nmodel encoders perform better semantic retrieval, training such models requires\nabundant query-article annotations that are difficult to obtain in biomedicine.\nAs a result, most biomedical IR systems only conduct lexical matching. In\nresponse, we introduce MedCPT, a first-of-its-kind Contrastively Pre-trained\nTransformer model for zero-shot semantic IR in biomedicine. To train MedCPT, we\ncollected an unprecedented scale of 255 million user click logs from PubMed.\nWith such data, we use contrastive learning to train a pair of\nclosely-integrated retriever and re-ranker. Experimental results show that\nMedCPT sets new state-of-the-art performance on six biomedical IR tasks,\noutperforming various baselines including much larger models such as\nGPT-3-sized cpt-text-XL. In addition, MedCPT also generates better biomedical\narticle and sentence representations for semantic evaluations. As such, MedCPT\ncan be readily applied to various real-world biomedical IR tasks.\n","authors":["Qiao Jin","Won Kim","Qingyu Chen","Donald C. Comeau","Lana Yeganova","W. John Wilbur","Zhiyong Lu"],"pdf_url":"https://arxiv.org/pdf/2307.00589v2.pdf","comment":"The MedCPT code and API are available at\n https://github.com/ncbi/MedCPT"},{"id":"http://arxiv.org/abs/2310.02518v1","updated":"2023-10-04T01:33:26Z","published":"2023-10-04T01:33:26Z","title":"Shaping the Epochal Individuality and Generality: The Temporal Dynamics\n of Uncertainty and Prediction Error in Musical Improvisation","summary":" Musical improvisation, much like spontaneous speech, reveals intricate facets\nof the improviser's state of mind and emotional character. However, the\nspecific musical components that reveal such individuality remain largely\nunexplored. 
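The Auto-FP study above finds random search to be a strong baseline for deciding which feature preprocessors to apply and in which order. A minimal random-search sketch over scikit-learn preprocessing pipelines is shown below; the candidate preprocessors, dataset, and search budget are illustrative choices, not the paper's benchmark setup:

```python
import random
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (MinMaxScaler, PowerTransformer,
                                   QuantileTransformer, StandardScaler)

CANDIDATES = {
    "standard": StandardScaler,
    "minmax": MinMaxScaler,
    "power": PowerTransformer,
    "quantile": QuantileTransformer,
}

def sample_pipeline(rng: random.Random) -> Pipeline:
    """Pick a random subset of preprocessors in a random order, then a model."""
    names = rng.sample(list(CANDIDATES), k=rng.randint(1, len(CANDIDATES)))
    steps = [(n, CANDIDATES[n]()) for n in names]
    steps.append(("clf", LogisticRegression(max_iter=2000)))
    return Pipeline(steps)

X, y = load_breast_cancer(return_X_y=True)
rng = random.Random(0)
best_score, best_pipe = -1.0, None
for _ in range(10):  # search budget
    pipe = sample_pipeline(rng)
    score = cross_val_score(pipe, X, y, cv=3).mean()
    if score > best_score:
        best_score, best_pipe = score, pipe
print(best_score, [name for name, _ in best_pipe.steps])
```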
Within the framework of brain's statistical learning and predictive\nprocessing, this study examined the temporal dynamics of uncertainty and\nsurprise (prediction error) in a piece of musical improvisation. This study\nemployed the HBSL model to analyze a corpus of 456 Jazz improvisations,\nspanning 1905 to 2009, from 78 distinct Jazz musicians. The results indicated\ndistinctive temporal patterns of surprise and uncertainty, especially in pitch\nand pitch-rhythm sequences, revealing era-specific features from the early 20th\nto the 21st centuries. Conversely, rhythm sequences exhibited a consistent\ndegree of uncertainty across eras. Further, the acoustic properties remain\nunchanged across different periods. These findings highlight the importance of\nhow temporal dynamics of surprise and uncertainty in improvisational music\nchange over periods, profoundly influencing the distinctive methodologies\nartists adopt for improvisation in each era. Further, it is suggested that the\ndevelopment of improvisational music can be attributed to the brain's adaptive\nstatistical learning mechanisms, which constantly refine internal models to\nmirror the cultural and emotional nuances of their respective epochs. This\nstudy unravels the evolutionary trajectory of improvisational music and\nhighlights the nuanced shifts artists employ to resonate with the cultural and\nemotional landscapes of their times.\n","authors":["Tatsuya Daikoku"],"pdf_url":"https://arxiv.org/pdf/2310.02518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.03851v3","updated":"2023-10-04T00:10:57Z","published":"2022-06-07T16:43:04Z","title":"Reconsidering Learning Objectives in Unbiased Recommendation with\n Unobserved Confounders","summary":" This work studies the problem of learning unbiased algorithms from biased\nfeedback for recommendation. We address this problem from a novel distribution\nshift perspective. Recent works in unbiased recommendation have advanced the\nstate-of-the-art with various techniques such as re-weighting, multi-task\nlearning, and meta-learning. Despite their empirical successes, most of them\nlack theoretical guarantees, forming non-negligible gaps between theories and\nrecent algorithms. In this paper, we propose a theoretical understanding of why\nexisting unbiased learning objectives work for unbiased recommendation. We\nestablish a close connection between unbiased recommendation and distribution\nshift, which shows that existing unbiased learning objectives implicitly align\nbiased training and unbiased test distributions. Built upon this connection, we\ndevelop two generalization bounds for existing unbiased learning methods and\nanalyze their learning behavior. Besides, as a result of the distribution\nshift, we further propose a principled framework, Adversarial Self-Training\n(AST), for unbiased recommendation. Extensive experiments on real-world and\nsemi-synthetic datasets demonstrate the effectiveness of AST.\n","authors":["Teng Xiao","Zhengyu Chen","Suhang Wang"],"pdf_url":"https://arxiv.org/pdf/2206.03851v3.pdf","comment":"KDD2023"},{"id":"http://arxiv.org/abs/2310.03200v1","updated":"2023-10-04T22:59:13Z","published":"2023-10-04T22:59:13Z","title":"Amazon Books Rating prediction & Recommendation Model","summary":" This paper uses the dataset of Amazon to predict the books ratings listed on\nAmazon website. As part of this project, we predicted the ratings of the books,\nand also built a recommendation cluster. 
This recommendation cluster provides\nthe recommended books based on the column's values from dataset, for instance,\ncategory, description, author, price, reviews etc. This paper provides a flow\nof handling big data files, data engineering, building models and providing\npredictions. The models predict book ratings column using various PySpark\nMachine Learning APIs. Additionally, we used hyper-parameters and parameters\ntuning. Also, Cross Validation and TrainValidationSplit were used for\ngeneralization. Finally, we performed a comparison between Binary\nClassification and Multiclass Classification in their accuracies. We converted\nour label from multiclass to binary to see if we could find any difference\nbetween the two classifications. As a result, we found out that we get higher\naccuracy in binary classification than in multiclass classification.\n","authors":["Hsiu-Ping Lin","Suman Chauhan","Yougender Chauhan","Nagender Chauhan","Jongwook Woo"],"pdf_url":"https://arxiv.org/pdf/2310.03200v1.pdf","comment":"5 pages, 4 figures, 8 tables"},{"id":"http://arxiv.org/abs/2310.03175v1","updated":"2023-10-04T21:43:16Z","published":"2023-10-04T21:43:16Z","title":"Impedance Leakage Vulnerability and its Utilization in\n Reverse-engineering Embedded Software","summary":" Discovering new vulnerabilities and implementing security and privacy\nmeasures are important to protect systems and data against physical attacks.\nOne such vulnerability is impedance, an inherent property of a device that can\nbe exploited to leak information through an unintended side channel, thereby\nposing significant security and privacy risks. Unlike traditional\nvulnerabilities, impedance is often overlooked or narrowly explored, as it is\ntypically treated as a fixed value at a specific frequency in research and\ndesign endeavors. Moreover, impedance has never been explored as a source of\ninformation leakage. This paper demonstrates that the impedance of an embedded\ndevice is not constant and directly relates to the programs executed on the\ndevice. We define this phenomenon as impedance leakage and use this as a side\nchannel to extract software instructions from protected memory. Our experiment\non the ATmega328P microcontroller and the Artix 7 FPGA indicates that the\nimpedance side channel can detect software instructions with 96.1% and 92.6%\naccuracy, respectively. Furthermore, we explore the dual nature of the\nimpedance side channel, highlighting the potential for beneficial purposes and\nthe associated risk of intellectual property theft. Finally, potential\ncountermeasures that specifically address impedance leakage are discussed.\n","authors":["Md Sadik Awal","Md Tauhidur Rahman"],"pdf_url":"https://arxiv.org/pdf/2310.03175v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2310.03026v1","updated":"2023-10-04T17:59:49Z","published":"2023-10-04T17:59:49Z","title":"LanguageMPC: Large Language Models as Decision Makers for Autonomous\n Driving","summary":" Existing learning-based autonomous driving (AD) systems face challenges in\ncomprehending high-level information, generalizing to rare events, and\nproviding interpretability. To address these problems, this work employs Large\nLanguage Models (LLMs) as a decision-making component for complex AD scenarios\nthat require human commonsense understanding. We devise cognitive pathways to\nenable comprehensive reasoning with LLMs, and develop algorithms for\ntranslating LLM decisions into actionable driving commands. 
Through this\napproach, LLM decisions are seamlessly integrated with low-level controllers by\nguided parameter matrix adaptation. Extensive experiments demonstrate that our\nproposed method not only consistently surpasses baseline approaches in\nsingle-vehicle tasks, but also helps handle complex driving behaviors even\nmulti-vehicle coordination, thanks to the commonsense reasoning capabilities of\nLLMs. This paper presents an initial step toward leveraging LLMs as effective\ndecision-makers for intricate AD scenarios in terms of safety, efficiency,\ngeneralizability, and interoperability. We aspire for it to serve as\ninspiration for future research in this field. Project page:\nhttps://sites.google.com/view/llm-mpc\n","authors":["Hao Sha","Yao Mu","Yuxuan Jiang","Li Chen","Chenfeng Xu","Ping Luo","Shengbo Eben Li","Masayoshi Tomizuka","Wei Zhan","Mingyu Ding"],"pdf_url":"https://arxiv.org/pdf/2310.03026v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03025v1","updated":"2023-10-04T17:59:41Z","published":"2023-10-04T17:59:41Z","title":"Retrieval meets Long Context Large Language Models","summary":" Extending the context window of large language models (LLMs) is getting\npopular recently, while the solution of augmenting LLMs with retrieval has\nexisted for years. The natural questions are: i) Retrieval-augmentation versus\nlong context window, which one is better for downstream tasks? ii) Can both\nmethods be combined to get the best of both worlds? In this work, we answer\nthese questions by studying both solutions using two state-of-the-art\npretrained LLMs, i.e., a proprietary 43B GPT and LLaMA2-70B. Perhaps\nsurprisingly, we find that LLM with 4K context window using simple\nretrieval-augmentation at generation can achieve comparable performance to\nfinetuned LLM with 16K context window via positional interpolation on long\ncontext tasks, while taking much less computation. More importantly, we\ndemonstrate that retrieval can significantly improve the performance of LLMs\nregardless of their extended context window sizes. Our best model,\nretrieval-augmented LLaMA2-70B with 32K context window, outperforms\nGPT-3.5-turbo-16k and Davinci003 in terms of average score on seven long\ncontext tasks including question answering and query-based summarization. It\nalso outperforms its non-retrieval LLaMA2-70B-32k baseline by a margin, while\nbeing much faster at generation. Our study provides general insights on the\nchoice of retrieval-augmentation versus long context extension of LLM for\npractitioners.\n","authors":["Peng Xu","Wei Ping","Xianchao Wu","Lawrence McAfee","Chen Zhu","Zihan Liu","Sandeep Subramanian","Evelina Bakhturina","Mohammad Shoeybi","Bryan Catanzaro"],"pdf_url":"https://arxiv.org/pdf/2310.03025v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03023v1","updated":"2023-10-04T17:59:38Z","published":"2023-10-04T17:59:38Z","title":"Human-oriented Representation Learning for Robotic Manipulation","summary":" Humans inherently possess generalizable visual representations that empower\nthem to efficiently explore and interact with the environments in manipulation\ntasks. We advocate that such a representation automatically arises from\nsimultaneously learning about multiple simple perceptual skills that are\ncritical for everyday scenarios (e.g., hand detection, state estimate, etc.)\nand is better suited for learning robot manipulation policies compared to\ncurrent state-of-the-art visual representations purely based on self-supervised\nobjectives. 
We formalize this idea through the lens of human-oriented\nmulti-task fine-tuning on top of pre-trained visual encoders, where each task\nis a perceptual skill tied to human-environment interactions. We introduce Task\nFusion Decoder as a plug-and-play embedding translator that utilizes the\nunderlying relationships among these perceptual skills to guide the\nrepresentation learning towards encoding meaningful structure for what's\nimportant for all perceptual skills, ultimately empowering learning of\ndownstream robotic manipulation tasks. Extensive experiments across a range of\nrobotic tasks and embodiments, in both simulations and real-world environments,\nshow that our Task Fusion Decoder consistently improves the representation of\nthree state-of-the-art visual encoders including R3M, MVP, and EgoVLP, for\ndownstream manipulation policy-learning. Project page:\nhttps://sites.google.com/view/human-oriented-robot-learning\n","authors":["Mingxiao Huo","Mingyu Ding","Chenfeng Xu","Thomas Tian","Xinghao Zhu","Yao Mu","Lingfeng Sun","Masayoshi Tomizuka","Wei Zhan"],"pdf_url":"https://arxiv.org/pdf/2310.03023v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03024v1","updated":"2023-10-04T17:59:38Z","published":"2023-10-04T17:59:38Z","title":"AstroCLIP: Cross-Modal Pre-Training for Astronomical Foundation Models","summary":" We present AstroCLIP, a strategy to facilitate the construction of\nastronomical foundation models that bridge the gap between diverse\nobservational modalities. We demonstrate that a cross-modal contrastive\nlearning approach between images and optical spectra of galaxies yields highly\ninformative embeddings of both modalities. In particular, we apply our method\non multi-band images and optical spectra from the Dark Energy Spectroscopic\nInstrument (DESI), and show that: (1) these embeddings are well-aligned between\nmodalities and can be used for accurate cross-modal searches, and (2) these\nembeddings encode valuable physical information about the galaxies -- in\nparticular redshift and stellar mass -- that can be used to achieve competitive\nzero- and few- shot predictions without further finetuning. Additionally, in\nthe process of developing our approach, we also construct a novel,\ntransformer-based model and pretraining approach for processing galaxy spectra.\n","authors":["Francois Lanusse","Liam Parker","Siavash Golkar","Miles Cranmer","Alberto Bietti","Michael Eickenberg","Geraud Krawezik","Michael McCabe","Ruben Ohana","Mariel Pettee","Bruno Regaldo-Saint Blancard","Tiberiu Tesileanu","Kyunghyun Cho","Shirley Ho"],"pdf_url":"https://arxiv.org/pdf/2310.03024v1.pdf","comment":"Submitted to the NeurIPS 2023 AI4Science Workshop"},{"id":"http://arxiv.org/abs/2310.03022v1","updated":"2023-10-04T17:59:32Z","published":"2023-10-04T17:59:32Z","title":"Decision ConvFormer: Local Filtering in MetaFormer is Sufficient for\n Decision Making","summary":" The recent success of Transformer in natural language processing has sparked\nits use in various domains. In offline reinforcement learning (RL), Decision\nTransformer (DT) is emerging as a promising model based on Transformer.\nHowever, we discovered that the attention module of DT is not appropriate to\ncapture the inherent local dependence pattern in trajectories of RL modeled as\na Markov decision process. 
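The AstroCLIP abstract above relies on cross-modal contrastive learning between galaxy images and optical spectra. The symmetric InfoNCE objective commonly used for this kind of alignment can be sketched as follows; batch size, embedding width, and temperature are assumptions, and the two modality encoders themselves are omitted:

```python
import torch
import torch.nn.functional as F

def clip_style_loss(img_emb: torch.Tensor, spec_emb: torch.Tensor,
                    temperature: float = 0.07) -> torch.Tensor:
    """Symmetric InfoNCE loss aligning paired image and spectrum embeddings."""
    img = F.normalize(img_emb, dim=-1)
    spec = F.normalize(spec_emb, dim=-1)
    logits = img @ spec.t() / temperature   # (B, B) similarity matrix
    targets = torch.arange(img.size(0))     # the i-th pair is the positive
    loss_i2s = F.cross_entropy(logits, targets)
    loss_s2i = F.cross_entropy(logits.t(), targets)
    return 0.5 * (loss_i2s + loss_s2i)

# Toy batch: 8 galaxies, 128-d embeddings from two modality encoders.
img_emb = torch.randn(8, 128, requires_grad=True)
spec_emb = torch.randn(8, 128, requires_grad=True)
loss = clip_style_loss(img_emb, spec_emb)
loss.backward()
print(float(loss))
```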
To overcome the limitations of DT, we propose a\nnovel action sequence predictor, named Decision ConvFormer (DC), based on the\narchitecture of MetaFormer, which is a general structure to process multiple\nentities in parallel and understand the interrelationship among the multiple\nentities. DC employs local convolution filtering as the token mixer and can\neffectively capture the inherent local associations of the RL dataset. In\nextensive experiments, DC achieved state-of-the-art performance across various\nstandard RL benchmarks while requiring fewer resources. Furthermore, we show\nthat DC better understands the underlying meaning in data and exhibits enhanced\ngeneralization capability.\n","authors":["Jeonghye Kim","Suyoung Lee","Woojun Kim","Youngchul Sung"],"pdf_url":"https://arxiv.org/pdf/2310.03022v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06651v2","updated":"2023-10-04T17:57:35Z","published":"2023-09-13T00:30:32Z","title":"ConR: Contrastive Regularizer for Deep Imbalanced Regression","summary":" Imbalanced distributions are ubiquitous in real-world data. They create\nconstraints on Deep Neural Networks to represent the minority labels and avoid\nbias towards majority labels. The extensive body of imbalanced approaches\naddress categorical label spaces but fail to effectively extend to regression\nproblems where the label space is continuous. Local and global correlations\namong continuous labels provide valuable insights towards effectively modelling\nrelationships in feature space. In this work, we propose ConR, a contrastive\nregularizer that models global and local label similarities in feature space\nand prevents the features of minority samples from being collapsed into their\nmajority neighbours. ConR discerns the disagreements between the label space\nand feature space and imposes a penalty on these disagreements. ConR addresses\nthe continuous nature of label space with two main strategies in a contrastive\nmanner: incorrect proximities are penalized proportionate to the label\nsimilarities and the correct ones are encouraged to model local similarities.\nConR consolidates essential considerations into a generic, easy-to-integrate,\nand efficient method that effectively addresses deep imbalanced regression.\nMoreover, ConR is orthogonal to existing approaches and smoothly extends to\nuni- and multi-dimensional label spaces. Our comprehensive experiments show\nthat ConR significantly boosts the performance of all the state-of-the-art\nmethods on four large-scale deep imbalanced regression benchmarks. Our code is\npublicly available in https://github.com/BorealisAI/ConR.\n","authors":["Mahsa Keramati","Lili Meng","R. David Evans"],"pdf_url":"https://arxiv.org/pdf/2309.06651v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03016v1","updated":"2023-10-04T17:57:33Z","published":"2023-10-04T17:57:33Z","title":"Understanding In-Context Learning in Transformers and LLMs by Learning\n to Learn Discrete Functions","summary":" In order to understand the in-context learning phenomenon, recent works have\nadopted a stylized experimental framework and demonstrated that Transformers\ncan learn gradient-based learning algorithms for various classes of real-valued\nfunctions. However, the limitations of Transformers in implementing learning\nalgorithms, and their ability to learn other forms of algorithms are not well\nunderstood. Additionally, the degree to which these capabilities are confined\nto attention-based models is unclear. 
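The ConR abstract above penalizes feature-space proximities that disagree with label similarities in regression. A much-simplified regularizer in that spirit, written only as an illustration and not as the paper's ConR objective, might look like this:

```python
import torch
import torch.nn.functional as F

def label_consistency_penalty(features: torch.Tensor, labels: torch.Tensor,
                              label_margin: float = 1.0) -> torch.Tensor:
    """Penalize pairs that are similar in feature space although their labels
    differ by more than `label_margin` (a simplified, illustrative regularizer)."""
    feats = F.normalize(features, dim=-1)
    sim = feats @ feats.t()                                # (B, B) cosine similarity
    label_dist = (labels[:, None] - labels[None, :]).abs()
    far_in_label = (label_dist > label_margin).float()
    off_diag = 1.0 - torch.eye(labels.size(0), device=labels.device)
    # Only "far-in-label but similar-in-feature" pairs contribute.
    penalty = far_in_label * off_diag * sim.clamp(min=0.0)
    return penalty.sum() / (far_in_label * off_diag).sum().clamp(min=1.0)

features = torch.randn(16, 64, requires_grad=True)
labels = torch.rand(16) * 10   # continuous regression targets
reg = label_consistency_penalty(features, labels)
reg.backward()
print(float(reg))
```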
Furthermore, it remains to be seen\nwhether the insights derived from these stylized settings can be extrapolated\nto pretrained Large Language Models (LLMs). In this work, we take a step\ntowards answering these questions by demonstrating the following: (a) On a\ntest-bed with a variety of Boolean function classes, we find that Transformers\ncan nearly match the optimal learning algorithm for 'simpler' tasks, while\ntheir performance deteriorates on more 'complex' tasks. Additionally, we find\nthat certain attention-free models perform (almost) identically to Transformers\non a range of tasks. (b) When provided a teaching sequence, i.e. a set of\nexamples that uniquely identifies a function in a class, we show that\nTransformers learn more sample-efficiently. Interestingly, our results show\nthat Transformers can learn to implement two distinct algorithms to solve a\nsingle task, and can adaptively select the more sample-efficient algorithm\ndepending on the sequence of in-context examples. (c) Lastly, we show that\nextant LLMs, e.g. LLaMA-2, GPT-4, can compete with nearest-neighbor baselines\non prediction tasks that are guaranteed to not be in their training set.\n","authors":["Satwik Bhattamishra","Arkil Patel","Phil Blunsom","Varun Kanade"],"pdf_url":"https://arxiv.org/pdf/2310.03016v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2310.03013v1","updated":"2023-10-04T17:56:41Z","published":"2023-10-04T17:56:41Z","title":"SemiReward: A General Reward Model for Semi-supervised Learning","summary":" Semi-supervised learning (SSL) has witnessed great progress with various\nimprovements in the self-training framework with pseudo labeling. The main\nchallenge is how to distinguish high-quality pseudo labels against the\nconfirmation bias. However, existing pseudo-label selection strategies are\nlimited to pre-defined schemes or complex hand-crafted policies specially\ndesigned for classification, failing to achieve high-quality labels, fast\nconvergence, and task versatility simultaneously. To these ends, we propose a\nSemi-supervised Reward framework (SemiReward) that predicts reward scores to\nevaluate and filter out high-quality pseudo labels, which is pluggable to\nmainstream SSL methods in wide task types and scenarios. To mitigate\nconfirmation bias, SemiReward is trained online in two stages with a generator\nmodel and subsampling strategy. With classification and regression tasks on 13\nstandard SSL benchmarks of three modalities, extensive experiments verify that\nSemiReward achieves significant performance gains and faster convergence speeds\nupon Pseudo Label, FlexMatch, and Free/SoftMatch.\n","authors":["Siyuan Li","Weiyang Jin","Zedong Wang","Fang Wu","Zicheng Liu","Cheng Tan","Stan Z. Li"],"pdf_url":"https://arxiv.org/pdf/2310.03013v1.pdf","comment":"Preprint of 22 pages with the source code at\n \\url{https://github.com/Westlake-AI/SemiReward}"},{"id":"http://arxiv.org/abs/2309.10657v2","updated":"2023-10-04T17:55:01Z","published":"2023-09-19T14:39:39Z","title":"Learning Adaptive Safety for Multi-Agent Systems","summary":" Ensuring safety in dynamic multi-agent systems is challenging due to limited\ninformation about the other agents. Control Barrier Functions (CBFs) are\nshowing promise for safety assurance but current methods make strong\nassumptions about other agents and often rely on manual tuning to balance\nsafety, feasibility, and performance. In this work, we delve into the problem\nof adaptive safe learning for multi-agent systems with CBF. 
We show how\nemergent behavior can be profoundly influenced by the CBF configuration,\nhighlighting the necessity for a responsive and dynamic approach to CBF design.\nWe present ASRL, a novel adaptive safe RL framework, to fully automate the\noptimization of policy and CBF coefficients, to enhance safety and long-term\nperformance through reinforcement learning. By directly interacting with the\nother agents, ASRL learns to cope with diverse agent behaviours and maintains\nthe cost violations below a desired limit. We evaluate ASRL in a multi-robot\nsystem and a competitive multi-agent racing scenario, against learning-based\nand control-theoretic approaches. We empirically demonstrate the efficacy and\nflexibility of ASRL, and assess generalization and scalability to\nout-of-distribution scenarios. Code and supplementary material are public\nonline.\n","authors":["Luigi Berducci","Shuo Yang","Rahul Mangharam","Radu Grosu"],"pdf_url":"https://arxiv.org/pdf/2309.10657v2.pdf","comment":"Update with appendix"},{"id":"http://arxiv.org/abs/2310.03010v1","updated":"2023-10-04T17:53:53Z","published":"2023-10-04T17:53:53Z","title":"High-dimensional SGD aligns with emerging outlier eigenspaces","summary":" We rigorously study the joint evolution of training dynamics via stochastic\ngradient descent (SGD) and the spectra of empirical Hessian and gradient\nmatrices. We prove that in two canonical classification tasks for multi-class\nhigh-dimensional mixtures and either 1 or 2-layer neural networks, the SGD\ntrajectory rapidly aligns with emerging low-rank outlier eigenspaces of the\nHessian and gradient matrices. Moreover, in multi-layer settings this alignment\noccurs per layer, with the final layer's outlier eigenspace evolving over the\ncourse of training, and exhibiting rank deficiency when the SGD converges to\nsub-optimal classifiers. This establishes some of the rich predictions that\nhave arisen from extensive numerical studies in the last decade about the\nspectra of Hessian and information matrices over the course of training in\noverparametrized networks.\n","authors":["Gerard Ben Arous","Reza Gheissari","Jiaoyang Huang","Aukosh Jagannath"],"pdf_url":"https://arxiv.org/pdf/2310.03010v1.pdf","comment":"52 pages, 12 figures"},{"id":"http://arxiv.org/abs/2310.03004v1","updated":"2023-10-04T17:45:14Z","published":"2023-10-04T17:45:14Z","title":"Soft Convex Quantization: Revisiting Vector Quantization with Convex\n Optimization","summary":" Vector Quantization (VQ) is a well-known technique in deep learning for\nextracting informative discrete latent representations. VQ-embedded models have\nshown impressive results in a range of applications including image and speech\ngeneration. VQ operates as a parametric K-means algorithm that quantizes inputs\nusing a single codebook vector in the forward pass. While powerful, this\ntechnique faces practical challenges including codebook collapse,\nnon-differentiability and lossy compression. To mitigate the aforementioned\nissues, we propose Soft Convex Quantization (SCQ) as a direct substitute for\nVQ. SCQ works like a differentiable convex optimization (DCO) layer: in the\nforward pass, we solve for the optimal convex combination of codebook vectors\nthat quantize the inputs. In the backward pass, we leverage differentiability\nthrough the optimality conditions of the forward solution. We then introduce a\nscalable relaxation of the SCQ optimization and demonstrate its efficacy on the\nCIFAR-10, GTSRB and LSUN datasets. 
We train powerful SCQ autoencoder models\nthat significantly outperform matched VQ-based architectures, observing an\norder of magnitude better image reconstruction and codebook usage with\ncomparable quantization runtime.\n","authors":["Tanmay Gautam","Reid Pryzant","Ziyi Yang","Chenguang Zhu","Somayeh Sojoudi"],"pdf_url":"https://arxiv.org/pdf/2310.03004v1.pdf","comment":"14 pages, 8 figures"},{"id":"http://arxiv.org/abs/2305.14585v2","updated":"2023-10-04T17:44:18Z","published":"2023-05-23T23:51:53Z","title":"Faithful and Efficient Explanations for Neural Networks via Neural\n Tangent Kernel Surrogate Models","summary":" A recent trend in explainable AI research has focused on surrogate modeling,\nwhere neural networks are approximated as simpler ML algorithms such as kernel\nmachines. A second trend has been to utilize kernel functions in various\nexplain-by-example or data attribution tasks to investigate a diverse set of\nneural network behavior. In this work, we combine these two trends to analyze\napproximate empirical neural tangent kernels (eNTK) for data attribution.\nApproximation is critical for eNTK analysis due to the high computational cost\nto compute the eNTK. We define new approximate eNTK and perform novel analysis\non how well the resulting kernel machine surrogate models correlate with the\nunderlying neural network. We introduce two new random projection variants of\napproximate eNTK which allow users to tune the time and memory complexity of\ntheir calculation. We conclude that kernel machines using approximate neural\ntangent kernel as the kernel function are effective surrogate models, with the\nintroduced trace NTK the most consistent performer.\n","authors":["Andrew Engel","Zhichao Wang","Natalie S. Frank","Ioana Dumitriu","Sutanay Choudhury","Anand Sarwate","Tony Chiang"],"pdf_url":"https://arxiv.org/pdf/2305.14585v2.pdf","comment":"Updated 10/4/2023: significant changes for ICLR2023 submission.\n Github repository will be live soon. 9 pages, 2 figures, 3 tables"},{"id":"http://arxiv.org/abs/2310.03001v1","updated":"2023-10-04T17:40:46Z","published":"2023-10-04T17:40:46Z","title":"Learning characteristic parameters and dynamics of centrifugal pumps\n under multi-phase flow using physics-informed neural networks","summary":" Electrical submersible pumps (ESP) are the second most used artificial\nlifting equipment in the oil and gas industry due to their high flow rates and\nboost pressures. They often have to handle multiphase flows, which usually\ncontain a mixture of hydrocarbons, water, and/or sediments. Given these\ncircumstances, emulsions are commonly formed. It is a liquid-liquid flow\ncomposed of two immiscible fluids whose effective viscosity and density differ\nfrom the single phase separately. In this context, accurate modeling of ESP\nsystems is crucial for optimizing oil production and implementing control\nstrategies. However, real-time and direct measurement of fluid and system\ncharacteristics is often impractical due to time constraints and economy.\nHence, indirect methods are generally considered to estimate the system\nparameters. In this paper, we formulate a machine learning model based on\nPhysics-Informed Neural Networks (PINNs) to estimate crucial system parameters.\nIn order to study the efficacy of the proposed PINN model, we conduct\ncomputational studies using not only simulated but also experimental data for\ndifferent water-oil ratios. 
We evaluate the state variable's dynamics and\nunknown parameters for various combinations when only intake and discharge\npressure measurements are available. We also study structural and practical\nidentifiability analyses based on commonly available pressure measurements. The\nPINN model could reduce the requirement of expensive field laboratory tests\nused to estimate fluid properties.\n","authors":["Felipe de Castro Teixeira Carvalho","Kamaljyoti Nath","Alberto Luiz Serpa","George Em Karniadakis"],"pdf_url":"https://arxiv.org/pdf/2310.03001v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02998v1","updated":"2023-10-04T17:34:00Z","published":"2023-10-04T17:34:00Z","title":"ECoFLaP: Efficient Coarse-to-Fine Layer-Wise Pruning for Vision-Language\n Models","summary":" Large Vision-Language Models (LVLMs) can understand the world comprehensively\nby integrating rich information from different modalities, achieving remarkable\nperformance improvements on various multimodal downstream tasks. However,\ndeploying LVLMs is often problematic due to their massive computational/energy\ncosts and carbon consumption. Such issues make it infeasible to adopt\nconventional iterative global pruning, which is costly due to computing the\nHessian matrix of the entire large model for sparsification. Alternatively,\nseveral studies have recently proposed layer-wise pruning approaches to avoid\nthe expensive computation of global pruning and efficiently compress model\nweights according to their importance within a layer. However, these methods\noften suffer from suboptimal model compression due to their lack of a global\nperspective. To address this limitation in recent efficient pruning methods for\nlarge models, we propose Efficient Coarse-to-Fine Layer-Wise Pruning (ECoFLaP),\na two-stage coarse-to-fine weight pruning approach for LVLMs. We first\ndetermine the sparsity ratios of different layers or blocks by leveraging the\nglobal importance score, which is efficiently computed based on the\nzeroth-order approximation of the global model gradients. Then, the multimodal\nmodel performs local layer-wise unstructured weight pruning based on\nglobally-informed sparsity ratios. We validate our proposed method across\nvarious multimodal and unimodal models and datasets, demonstrating significant\nperformance improvements over prevalent pruning techniques in the high-sparsity\nregime.\n","authors":["Yi-Lin Sung","Jaehong Yoon","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2310.02998v1.pdf","comment":"Project page: https://ecoflap.github.io/"},{"id":"http://arxiv.org/abs/2310.02995v1","updated":"2023-10-04T17:30:50Z","published":"2023-10-04T17:30:50Z","title":"IBCL: Zero-shot Model Generation for Task Trade-offs in Continual\n Learning","summary":" Like generic multi-task learning, continual learning has the nature of\nmulti-objective optimization, and therefore faces a trade-off between the\nperformance of different tasks. That is, to optimize for the current task\ndistribution, it may need to compromise performance on some previous tasks.\nThis means that there exist multiple models that are Pareto-optimal at\ndifferent times, each addressing a distinct task performance trade-off.\nResearchers have discussed how to train particular models to address specific\ntrade-off preferences. However, existing algorithms require training overheads\nproportional to the number of preferences -- a large burden when there are\nmultiple, possibly infinitely many, preferences. 
As a response, we propose\nImprecise Bayesian Continual Learning (IBCL). Upon a new task, IBCL (1) updates\na knowledge base in the form of a convex hull of model parameter distributions\nand (2) obtains particular models to address task trade-off preferences with\nzero-shot. That is, IBCL does not require any additional training overhead to\ngenerate preference-addressing models from its knowledge base. We show that\nmodels obtained by IBCL have guarantees in identifying the Pareto optimal\nparameters. Moreover, experiments on standard image classification and NLP\ntasks support this guarantee. Statistically, IBCL improves average per-task\naccuracy by at most 23\\% and peak per-task accuracy by at most 15\\% with\nrespect to the baseline methods, with steadily near-zero or positive backward\ntransfer. Most importantly, IBCL significantly reduces the training overhead\nfrom training 1 model per preference to at most 3 models for all preferences.\n","authors":["Pengyuan Lu","Michele Caprio","Eric Eaton","Insup Lee"],"pdf_url":"https://arxiv.org/pdf/2310.02995v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2305.14782"},{"id":"http://arxiv.org/abs/2310.02994v1","updated":"2023-10-04T17:29:19Z","published":"2023-10-04T17:29:19Z","title":"Multiple Physics Pretraining for Physical Surrogate Models","summary":" We introduce multiple physics pretraining (MPP), an autoregressive\ntask-agnostic pretraining approach for physical surrogate modeling. MPP\ninvolves training large surrogate models to predict the dynamics of multiple\nheterogeneous physical systems simultaneously by learning features that are\nbroadly useful across diverse physical tasks. In order to learn effectively in\nthis setting, we introduce a shared embedding and normalization strategy that\nprojects the fields of multiple systems into a single shared embedding space.\nWe validate the efficacy of our approach on both pretraining and downstream\ntasks over a broad fluid mechanics-oriented benchmark. We show that a single\nMPP-pretrained transformer is able to match or outperform task-specific\nbaselines on all pretraining sub-tasks without the need for finetuning. For\ndownstream tasks, we demonstrate that finetuning MPP-trained models results in\nmore accurate predictions across multiple time-steps on new physics compared to\ntraining from scratch or finetuning pretrained video foundation models. We\nopen-source our code and model weights trained at multiple scales for\nreproducibility and community experimentation.\n","authors":["Michael McCabe","Bruno Régaldo-Saint Blancard","Liam Holden Parker","Ruben Ohana","Miles Cranmer","Alberto Bietti","Michael Eickenberg","Siavash Golkar","Geraud Krawezik","Francois Lanusse","Mariel Pettee","Tiberiu Tesileanu","Kyunghyun Cho","Shirley Ho"],"pdf_url":"https://arxiv.org/pdf/2310.02994v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02989v1","updated":"2023-10-04T17:26:16Z","published":"2023-10-04T17:26:16Z","title":"xVal: A Continuous Number Encoding for Large Language Models","summary":" Large Language Models have not yet been broadly adapted for the analysis of\nscientific datasets due in part to the unique difficulties of tokenizing\nnumbers. We propose xVal, a numerical encoding scheme that represents any real\nnumber using just a single token. xVal represents a given real number by\nscaling a dedicated embedding vector by the number value. 
Combined with a\nmodified number-inference approach, this strategy renders the model end-to-end\ncontinuous when considered as a map from the numbers of the input string to\nthose of the output string. This leads to an inductive bias that is generally\nmore suitable for applications in scientific domains. We empirically evaluate\nour proposal on a number of synthetic and real-world datasets. Compared with\nexisting number encoding schemes, we find that xVal is more token-efficient and\ndemonstrates improved generalization.\n","authors":["Siavash Golkar","Mariel Pettee","Michael Eickenberg","Alberto Bietti","Miles Cranmer","Geraud Krawezik","Francois Lanusse","Michael McCabe","Ruben Ohana","Liam Parker","Bruno Régaldo-Saint Blancard","Tiberiu Tesileanu","Kyunghyun Cho","Shirley Ho"],"pdf_url":"https://arxiv.org/pdf/2310.02989v1.pdf","comment":"10 pages 7 figures. Supplementary: 5 pages 2 figures"},{"id":"http://arxiv.org/abs/2310.02987v1","updated":"2023-10-04T17:24:45Z","published":"2023-10-04T17:24:45Z","title":"Variance Reduced Halpern Iteration for Finite-Sum Monotone Inclusions","summary":" Machine learning approaches relying on such criteria as adversarial\nrobustness or multi-agent settings have raised the need for solving\ngame-theoretic equilibrium problems. Of particular relevance to these\napplications are methods targeting finite-sum structure, which generically\narises in empirical variants of learning problems in these contexts. Further,\nmethods with computable approximation errors are highly desirable, as they\nprovide verifiable exit criteria. Motivated by these applications, we study\nfinite-sum monotone inclusion problems, which model broad classes of\nequilibrium problems. Our main contributions are variants of the classical\nHalpern iteration that employ variance reduction to obtain improved complexity\nguarantees in which $n$ component operators in the finite sum are ``on\naverage'' either cocoercive or Lipschitz continuous and monotone, with\nparameter $L$. The resulting oracle complexity of our methods, which provide\nguarantees for the last iterate and for a (computable) operator norm residual,\nis $\\widetilde{\\mathcal{O}}( n + \\sqrt{n}L\\varepsilon^{-1})$, which improves\nupon existing methods by a factor up to $\\sqrt{n}$. This constitutes the first\nvariance reduction-type result for general finite-sum monotone inclusions and\nfor more specific problems such as convex-concave optimization when operator\nnorm residual is the optimality measure. We further argue that, up to\npoly-logarithmic factors, this complexity is unimprovable in the monotone\nLipschitz setting; i.e., the provided result is near-optimal.\n","authors":["Xufeng Cai","Ahmet Alacaoglu","Jelena Diakonikolas"],"pdf_url":"https://arxiv.org/pdf/2310.02987v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02986v1","updated":"2023-10-04T17:24:38Z","published":"2023-10-04T17:24:38Z","title":"Exploring the Impact of Disrupted Peer-to-Peer Communications on Fully\n Decentralized Learning in Disaster Scenarios","summary":" Fully decentralized learning enables the distribution of learning resources\nand decision-making capabilities across multiple user devices or nodes, and is\nrapidly gaining popularity due to its privacy-preserving and decentralized\nnature. 
Importantly, this crowdsourcing of the learning process allows the\nsystem to continue functioning even if some nodes are affected or disconnected.\nIn a disaster scenario, communication infrastructure and centralized systems\nmay be disrupted or completely unavailable, hindering the possibility of\ncarrying out standard centralized learning tasks in these settings. Thus, fully\ndecentralized learning can help in this case. However, transitioning from\ncentralized to peer-to-peer communications introduces a dependency between the\nlearning process and the topology of the communication graph among nodes. In a\ndisaster scenario, even peer-to-peer communications are susceptible to abrupt\nchanges, such as devices running out of battery or getting disconnected from\nothers due to their position. In this study, we investigate the effects of\nvarious disruptions to peer-to-peer communications on decentralized learning in\na disaster setting. We examine the resilience of a decentralized learning\nprocess when a subset of devices drop from the process abruptly. To this end,\nwe analyze the difference between losing devices holding data, i.e., potential\nknowledge, vs. devices contributing only to the graph connectivity, i.e., with\nno data. Our findings on a Barabasi-Albert graph topology, where training data\nis distributed across nodes in an IID fashion, indicate that the accuracy of\nthe learning process is more affected by a loss of connectivity than by a loss\nof data. Nevertheless, the network remains relatively robust, and the learning\nprocess can achieve a good level of accuracy.\n","authors":["Luigi Palmieri","Chiara Boldrini","Lorenzo Valerio","Andrea Passarella","Marco Conti"],"pdf_url":"https://arxiv.org/pdf/2310.02986v1.pdf","comment":"Accepted at IEEE ICT-DM 2023"},{"id":"http://arxiv.org/abs/2310.02984v1","updated":"2023-10-04T17:20:34Z","published":"2023-10-04T17:20:34Z","title":"Scaling Laws for Associative Memories","summary":" Learning arguably involves the discovery and memorization of abstract rules.\nThe aim of this paper is to study associative memory mechanisms. Our model is\nbased on high-dimensional matrices consisting of outer products of embeddings,\nwhich relates to the inner layers of transformer language models. We derive\nprecise scaling laws with respect to sample size and parameter size, and\ndiscuss the statistical efficiency of different estimators, including\noptimization-based algorithms. We provide extensive numerical experiments to\nvalidate and interpret theoretical results, including fine-grained\nvisualizations of the stored memory associations.\n","authors":["Vivien Cabannes","Elvis Dohmatob","Alberto Bietti"],"pdf_url":"https://arxiv.org/pdf/2310.02984v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02980v1","updated":"2023-10-04T17:17:06Z","published":"2023-10-04T17:17:06Z","title":"Never Train from Scratch: Fair Comparison of Long-Sequence Models\n Requires Data-Driven Priors","summary":" Modeling long-range dependencies across sequences is a longstanding goal in\nmachine learning and has led to architectures, such as state space models, that\ndramatically outperform Transformers on long sequences. However, these\nimpressive empirical gains have been by and large demonstrated on benchmarks\n(e.g. Long Range Arena), where models are randomly initialized and trained to\npredict a target label from an input sequence. 
In this work, we show that\nrandom initialization leads to gross overestimation of the differences between\narchitectures and that pretraining with standard denoising objectives, using\n$\\textit{only the downstream task data}$, leads to dramatic gains across\nmultiple architectures and to very small gaps between Transformers and state\nspace models (SSMs). In stark contrast to prior works, we find vanilla\nTransformers to match the performance of S4 on Long Range Arena when properly\npretrained, and we improve the best reported results of SSMs on the PathX-256\ntask by 20 absolute points. Subsequently, we analyze the utility of\npreviously-proposed structured parameterizations for SSMs and show they become\nmostly redundant in the presence of data-driven initialization obtained through\npretraining. Our work shows that, when evaluating different architectures on\nsupervised tasks, incorporation of data-driven priors via pretraining is\nessential for reliable performance estimation, and can be done efficiently.\n","authors":["Ido Amos","Jonathan Berant","Ankit Gupta"],"pdf_url":"https://arxiv.org/pdf/2310.02980v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02977v1","updated":"2023-10-04T17:12:18Z","published":"2023-10-04T17:12:18Z","title":"T$^3$Bench: Benchmarking Current Progress in Text-to-3D Generation","summary":" Recent methods in text-to-3D leverage powerful pretrained diffusion models to\noptimize NeRF. Notably, these methods are able to produce high-quality 3D\nscenes without training on 3D data. Due to the open-ended nature of the task,\nmost studies evaluate their results with subjective case studies and user\nexperiments, thereby presenting a challenge in quantitatively addressing the\nquestion: How has current progress in Text-to-3D gone so far? In this paper, we\nintroduce T$^3$Bench, the first comprehensive text-to-3D benchmark containing\ndiverse text prompts of three increasing complexity levels that are specially\ndesigned for 3D generation. To assess both the subjective quality and the text\nalignment, we propose two automatic metrics based on multi-view images produced\nby the 3D contents. The quality metric combines multi-view text-image scores\nand regional convolution to detect quality and view inconsistency. The\nalignment metric uses multi-view captioning and Large Language Model (LLM)\nevaluation to measure text-3D consistency. Both metrics closely correlate with\ndifferent dimensions of human judgments, providing a paradigm for efficiently\nevaluating text-to-3D models. The benchmarking results, shown in Fig. 1, reveal\nperformance differences among six prevalent text-to-3D methods. Our analysis\nfurther highlights the common struggles for current methods on generating\nsurroundings and multi-object scenes, as well as the bottleneck of leveraging\n2D guidance for 3D generation. Our project page is available at:\nhttps://t3bench.com.\n","authors":["Yuze He","Yushi Bai","Matthieu Lin","Wang Zhao","Yubin Hu","Jenny Sheng","Ran Yi","Juanzi Li","Yong-Jin Liu"],"pdf_url":"https://arxiv.org/pdf/2310.02977v1.pdf","comment":"16 pages, 11 figures"},{"id":"http://arxiv.org/abs/2310.02975v1","updated":"2023-10-04T17:11:15Z","published":"2023-10-04T17:11:15Z","title":"Towards Fully Adaptive Regret Minimization in Heavy-Tailed Bandits","summary":" Heavy-tailed distributions naturally arise in many settings, from finance to\ntelecommunications. 
While regret minimization under sub-Gaussian or bounded\nsupport rewards has been widely studied, learning on heavy-tailed distributions\nonly gained popularity over the last decade. In the stochastic heavy-tailed\nbandit problem, an agent learns under the assumption that the distributions\nhave finite moments of maximum order $1+\\epsilon$ which are uniformly bounded\nby a constant $u$, for some $\\epsilon \\in (0,1]$. To the best of our knowledge,\nliterature only provides algorithms requiring these two quantities as an input.\nIn this paper, we study the stochastic adaptive heavy-tailed bandit, a\nvariation of the standard setting where both $\\epsilon$ and $u$ are unknown to\nthe agent. We show that adaptivity comes at a cost, introducing two lower\nbounds on the regret of any adaptive algorithm, implying a higher regret w.r.t.\nthe standard setting. Finally, we introduce a specific distributional\nassumption and provide Adaptive Robust UCB, a regret minimization strategy\nmatching the known lower bound for the heavy-tailed MAB problem.\n","authors":["Gianmarco Genalti","Lupo Marsigli","Nicola Gatti","Alberto Maria Metelli"],"pdf_url":"https://arxiv.org/pdf/2310.02975v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.11896v3","updated":"2023-10-04T17:10:46Z","published":"2022-11-21T22:51:16Z","title":"Private Ad Modeling with DP-SGD","summary":" A well-known algorithm in privacy-preserving ML is differentially private\nstochastic gradient descent (DP-SGD). While this algorithm has been evaluated\non text and image data, it has not been previously applied to ads data, which\nare notorious for their high class imbalance and sparse gradient updates. In\nthis work we apply DP-SGD to several ad modeling tasks including predicting\nclick-through rates, conversion rates, and number of conversion events, and\nevaluate their privacy-utility trade-off on real-world datasets. Our work is\nthe first to empirically demonstrate that DP-SGD can provide both privacy and\nutility for ad modeling tasks.\n","authors":["Carson Denison","Badih Ghazi","Pritish Kamath","Ravi Kumar","Pasin Manurangsi","Krishna Giri Narra","Amer Sinha","Avinash V Varadarajan","Chiyuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2211.11896v3.pdf","comment":"AdKDD 2023, 8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2310.02970v1","updated":"2023-10-04T17:06:32Z","published":"2023-10-04T17:06:32Z","title":"Fast, Expressive SE$(n)$ Equivariant Networks through Weight-Sharing in\n Position-Orientation Space","summary":" Based on the theory of homogeneous spaces we derive \\textit{geometrically\noptimal edge attributes} to be used within the flexible message passing\nframework. We formalize the notion of weight sharing in convolutional networks\nas the sharing of message functions over point-pairs that should be treated\nequally. We define equivalence classes of point-pairs that are identical up to\na transformation in the group and derive attributes that uniquely identify\nthese classes. Weight sharing is then obtained by conditioning message\nfunctions on these attributes. As an application of the theory, we develop an\nefficient equivariant group convolutional network for processing 3D point\nclouds. The theory of homogeneous spaces tells us how to do group convolutions\nwith feature maps over the homogeneous space of positions $\\mathbb{R}^3$,\nposition and orientations $\\mathbb{R}^3 {\\times} S^2$, and the group SE$(3)$\nitself. 
Among these, $\\mathbb{R}^3 {\\times} S^2$ is an optimal choice due to\nthe ability to represent directional information, which $\\mathbb{R}^3$ methods\ncannot, and it significantly enhances computational efficiency compared to\nindexing features on the full SE$(3)$ group. We empirically support this claim\nby reaching state-of-the-art results -- in accuracy and speed -- on three\ndifferent benchmarks: interatomic potential energy prediction, trajectory\nforecasting in N-body systems, and generating molecules via equivariant\ndiffusion models.\n","authors":["Erik J Bekkers","Sharvaree Vadgama","Rob D Hesselink","Putri A van der Linden","David W Romero"],"pdf_url":"https://arxiv.org/pdf/2310.02970v1.pdf","comment":"Our code is publicly available at https://github.com/ebekkers/ponita"},{"id":"http://arxiv.org/abs/2310.02969v1","updated":"2023-10-04T17:06:30Z","published":"2023-10-04T17:06:30Z","title":"Dual Conic Proxies for AC Optimal Power Flow","summary":" In recent years, there has been significant interest in the development of\nmachine learning-based optimization proxies for AC Optimal Power Flow (AC-OPF).\nAlthough significant progress has been achieved in predicting high-quality\nprimal solutions, no existing learning-based approach can provide valid dual\nbounds for AC-OPF. This paper addresses this gap by training optimization\nproxies for a convex relaxation of AC-OPF. Namely, the paper considers a\nsecond-order cone (SOC) relaxation of ACOPF, and proposes a novel dual\narchitecture that embeds a fast, differentiable (dual) feasibility recovery,\nthus providing valid dual bounds. The paper combines this new architecture with\na self-supervised learning scheme, which alleviates the need for costly\ntraining data generation. Extensive numerical experiments on medium- and\nlarge-scale power grids demonstrate the efficiency and scalability of the\nproposed methodology.\n","authors":["Guancheng Qiu","Mathieu Tanneau","Pascal Van Hentenryck"],"pdf_url":"https://arxiv.org/pdf/2310.02969v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02964v1","updated":"2023-10-04T16:58:25Z","published":"2023-10-04T16:58:25Z","title":"Co-modeling the Sequential and Graphical Route for Peptide","summary":" Peptides are formed by the dehydration condensation of multiple amino acids.\nThe primary structure of a peptide can be represented either as an amino acid\nsequence or as a molecular graph consisting of atoms and chemical bonds.\nPrevious studies have indicated that deep learning routes specific to\nsequential and graphical peptide forms exhibit comparable performance on\ndownstream tasks. Despite the fact that these models learn representations of\nthe same modality of peptides, we find that they explain their predictions\ndifferently. Considering sequential and graphical models as two experts making\ninferences from different perspectives, we work on fusing expert knowledge to\nenrich the learned representations for improving the discriminative\nperformance. To achieve this, we propose a peptide co-modeling method, RepCon,\nwhich employs a contrastive learning-based framework to enhance the mutual\ninformation of representations from decoupled sequential and graphical\nend-to-end models. It considers representations from the sequential encoder and\nthe graphical encoder for the same peptide sample as a positive pair and learns\nto enhance the consistency of representations between positive sample pairs and\nto repel representations between negative pairs. 
Empirical studies of RepCon\nand other co-modeling methods are conducted on open-source discriminative\ndatasets, including aggregation propensity, retention time, antimicrobial\npeptide prediction, and family classification from Peptide Database. Our\nresults demonstrate the superiority of the co-modeling approach over\nindependent modeling, as well as the superiority of RepCon over other methods\nunder the co-modeling framework. In addition, the attribution on RepCon further\ncorroborates the validity of the approach at the level of model explanation.\n","authors":["Zihan Liu","Ge Wang","Jiaqi Wang","Jiangbin Zheng","Stan Z. Li"],"pdf_url":"https://arxiv.org/pdf/2310.02964v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.14509v2","updated":"2023-10-04T16:51:13Z","published":"2023-09-25T20:15:57Z","title":"DeepSpeed Ulysses: System Optimizations for Enabling Training of Extreme\n Long Sequence Transformer Models","summary":" Computation in a typical Transformer-based large language model (LLM) can be\ncharacterized by batch size, hidden dimension, number of layers, and sequence\nlength. Until now, system works for accelerating LLM training have focused on\nthe first three dimensions: data parallelism for batch size, tensor parallelism\nfor hidden size and pipeline parallelism for model depth or layers. These\nwidely studied forms of parallelism are not targeted or optimized for long\nsequence Transformer models. Given practical application needs for long\nsequence LLM, renewed attentions are being drawn to sequence parallelism.\nHowever, existing works in sequence parallelism are constrained by\nmemory-communication inefficiency, limiting their scalability to long sequence\nlarge models. In this work, we introduce DeepSpeed-Ulysses, a novel, portable\nand effective methodology for enabling highly efficient and scalable LLM\ntraining with extremely long sequence length. DeepSpeed-Ulysses at its core\npartitions input data along the sequence dimension and employs an efficient\nall-to-all collective communication for attention computation. Theoretical\ncommunication analysis shows that whereas other methods incur communication\noverhead as sequence length increases, DeepSpeed-Ulysses maintains constant\ncommunication volume when sequence length and compute devices are increased\nproportionally. Furthermore, experimental evaluations show that\nDeepSpeed-Ulysses trains 2.5x faster with 4x longer sequence length than the\nexisting method SOTA baseline.\n","authors":["Sam Ade Jacobs","Masahiro Tanaka","Chengming Zhang","Minjia Zhang","Shuaiwen Leon Song","Samyam Rajbhandari","Yuxiong He"],"pdf_url":"https://arxiv.org/pdf/2309.14509v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02956v1","updated":"2023-10-04T16:46:26Z","published":"2023-10-04T16:46:26Z","title":"Credit card score prediction using machine learning models: A new\n dataset","summary":" The use of credit cards has recently increased, creating an essential need\nfor credit card assessment methods to minimize potential risks. This study\ninvestigates the utilization of machine learning (ML) models for credit card\ndefault prediction system. The main goal here is to investigate the\nbest-performing ML model for new proposed credit card scoring dataset. 
This new\ndataset, which includes credit card transaction histories and customer profiles, is\nproposed and tested using a variety of machine learning algorithms, including\nlogistic regression, decision trees, random forests, a multi-layer perceptron\n(MLP) neural network, XGBoost, and LightGBM. To prepare the data for machine\nlearning models, we apply data pre-processing, feature extraction, feature\nselection, and data balancing techniques. Experimental results demonstrate that\nMLP outperforms logistic regression, decision trees, random forests, LightGBM,\nand XGBoost in terms of predictive performance in true positive rate, achieving\nan impressive area under the curve (AUC) of 86.7% and an accuracy rate of\n91.6%, with a recall rate exceeding 80%. These results indicate the superiority\nof MLP in predicting default customers and assessing the potential risks.\nFurthermore, they help banks and other financial institutions in predicting\nloan defaults at an earlier stage.\n","authors":["Anas Arram","Masri Ayob","Musatafa Abbas Abbood Albadr","Alaa Sulaiman","Dheeb Albashish"],"pdf_url":"https://arxiv.org/pdf/2310.02956v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02951v1","updated":"2023-10-04T16:41:36Z","published":"2023-10-04T16:41:36Z","title":"A Fisher-Rao gradient flow for entropy-regularised Markov decision\n processes in Polish spaces","summary":" We study the global convergence of a Fisher-Rao policy gradient flow for\ninfinite-horizon entropy-regularised Markov decision processes with Polish\nstate and action space. The flow is a continuous-time analogue of a policy\nmirror descent method. We establish the global well-posedness of the gradient\nflow and demonstrate its exponential convergence to the optimal policy.\nMoreover, we prove the flow is stable with respect to gradient evaluation,\noffering insights into the performance of a natural policy gradient flow with\nlog-linear policy parameterisation. To overcome challenges stemming from the\nlack of the convexity of the objective function and the discontinuity arising\nfrom the entropy regulariser, we leverage the performance difference lemma and\nthe duality relationship between the gradient and mirror descent flows.\n","authors":["Bekzhan Kerimkulov","James-Michael Leahy","David Siska","Lukasz Szpruch","Yufei Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.02951v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02949v1","updated":"2023-10-04T16:39:31Z","published":"2023-10-04T16:39:31Z","title":"Shadow Alignment: The Ease of Subverting Safely-Aligned Language Models","summary":" Warning: This paper contains examples of harmful language, and reader\ndiscretion is recommended. The increasing open release of powerful large\nlanguage models (LLMs) has facilitated the development of downstream\napplications by reducing the essential cost of data annotation and computation.\nTo ensure AI safety, extensive safety-alignment measures have been conducted to\narmor these models against malicious use (primarily hard prompt attack).\nHowever, beneath the seemingly resilient facade of the armor, there might lurk\na shadow. By simply tuning on 100 malicious examples with 1 GPU hour, these\nsafely aligned LLMs can be easily subverted to generate harmful content.\nFormally, we term this new attack Shadow Alignment: utilizing a tiny amount of\ndata can elicit safely-aligned models to adapt to harmful tasks without\nsacrificing model helpfulness. 
Remarkably, the subverted models retain their\ncapability to respond appropriately to regular inquiries. Experiments across 8\nmodels released by 5 different organizations (LLaMa-2, Falcon, InternLM,\nBaiChuan2, Vicuna) demonstrate the effectiveness of shadow alignment attack.\nBesides, the single-turn English-only attack successfully transfers to\nmulti-turn dialogue and other languages. This study serves as a clarion call\nfor a collective effort to overhaul and fortify the safety of open-source LLMs\nagainst malicious attackers.\n","authors":["Xianjun Yang","Xiao Wang","Qi Zhang","Linda Petzold","William Yang Wang","Xun Zhao","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2310.02949v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2310.02948v1","updated":"2023-10-04T16:36:32Z","published":"2023-10-04T16:36:32Z","title":"HappyFeat -- An interactive and efficient BCI framework for clinical\n applications","summary":" Brain-Computer Interface (BCI) systems allow users to perform actions by\ntranslating their brain activity into commands. Such systems usually need a\ntraining phase, consisting in training a classification algorithm to\ndiscriminate between mental states using specific features from the recorded\nsignals. This phase of feature selection and training is crucial for BCI\nperformance and presents specific constraints to be met in a clinical context,\nsuch as post-stroke rehabilitation.\n In this paper, we present HappyFeat, a software making Motor Imagery (MI)\nbased BCI experiments easier, by gathering all necessary manipulations and\nanalysis in a single convenient GUI and via automation of experiment or\nanalysis parameters. The resulting workflow allows for effortlessly selecting\nthe best features, helping to achieve good BCI performance in time-constrained\nenvironments. Alternative features based on Functional Connectivity can be used\nand compared or combined with Power Spectral Density, allowing a\nnetwork-oriented approach.\n We then give details of HappyFeat's main mechanisms, and a review of its\nperformances in typical use cases. We also show that it can be used as an\nefficient tool for comparing different metrics extracted from the signals, to\ntrain the classification algorithm. To this end, we show a comparison between\nthe commonly-used Power Spectral Density and network metrics based on\nFunctional Connectivity.\n HappyFeat is available as an open-source project which can be freely\ndownloaded on GitHub.\n","authors":["Arthur Desbois","Tristan Venot","Fabrizio De Vico Fallani","Marie-Constance Corsi"],"pdf_url":"https://arxiv.org/pdf/2310.02948v1.pdf","comment":"16 pages, 5 figures, 2 tables, \"Annex\" section"},{"id":"http://arxiv.org/abs/2310.00357v2","updated":"2023-10-04T16:34:58Z","published":"2023-09-30T12:27:53Z","title":"Structural Adversarial Objectives for Self-Supervised Representation\n Learning","summary":" Within the framework of generative adversarial networks (GANs), we propose\nobjectives that task the discriminator for self-supervised representation\nlearning via additional structural modeling responsibilities. In combination\nwith an efficient smoothness regularizer imposed on the network, these\nobjectives guide the discriminator to learn to extract informative\nrepresentations, while maintaining a generator capable of sampling from the\ndomain. 
Specifically, our objectives encourage the discriminator to structure\nfeatures at two levels of granularity: aligning distribution characteristics,\nsuch as mean and variance, at coarse scales, and grouping features into local\nclusters at finer scales. Operating as a feature learner within the GAN\nframework frees our self-supervised system from the reliance on hand-crafted\ndata augmentation schemes that are prevalent across contrastive representation\nlearning methods. Across CIFAR-10/100 and an ImageNet subset, experiments\ndemonstrate that equipping GANs with our self-supervised objectives suffices to\nproduce discriminators which, evaluated in terms of representation learning,\ncompete with networks trained by contrastive learning approaches.\n","authors":["Xiao Zhang","Michael Maire"],"pdf_url":"https://arxiv.org/pdf/2310.00357v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.13259v3","updated":"2023-10-04T16:30:14Z","published":"2022-01-31T14:07:49Z","title":"Trajectory balance: Improved credit assignment in GFlowNets","summary":" Generative flow networks (GFlowNets) are a method for learning a stochastic\npolicy for generating compositional objects, such as graphs or strings, from a\ngiven unnormalized density by sequences of actions, where many possible action\nsequences may lead to the same object. We find previously proposed learning\nobjectives for GFlowNets, flow matching and detailed balance, which are\nanalogous to temporal difference learning, to be prone to inefficient credit\npropagation across long action sequences. We thus propose a new learning\nobjective for GFlowNets, trajectory balance, as a more efficient alternative to\npreviously used objectives. We prove that any global minimizer of the\ntrajectory balance objective can define a policy that samples exactly from the\ntarget distribution. In experiments on four distinct domains, we empirically\ndemonstrate the benefits of the trajectory balance objective for GFlowNet\nconvergence, diversity of generated samples, and robustness to long action\nsequences and large action spaces.\n","authors":["Nikolay Malkin","Moksh Jain","Emmanuel Bengio","Chen Sun","Yoshua Bengio"],"pdf_url":"https://arxiv.org/pdf/2201.13259v3.pdf","comment":"NeurIPS 2022; see footnotes for code; v3 fixes minor errata"},{"id":"http://arxiv.org/abs/2308.13111v3","updated":"2023-10-04T16:29:23Z","published":"2023-08-24T23:06:21Z","title":"Bayesian low-rank adaptation for large language models","summary":" Low-rank adaptation (LoRA) has emerged as a new paradigm for cost-efficient\nfine-tuning of large language models (LLMs). However, fine-tuned LLMs often\nbecome overconfident especially when fine-tuned on small datasets. Bayesian\nmethods, with their inherent ability to estimate uncertainty, serve as potent\ntools to mitigate overconfidence and enhance calibration. In this work, we\nintroduce Laplace-LoRA, which applies a Bayesian approach to the LoRA\nparameters. Specifically, Laplace-LoRA applies a Laplace approximation to the\nposterior over the LoRA parameters, considerably improving the calibration of\nfine-tuned LLMs.\n","authors":["Adam X. 
Yang","Maxime Robeyns","Xi Wang","Laurence Aitchison"],"pdf_url":"https://arxiv.org/pdf/2308.13111v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.13409v2","updated":"2023-10-04T16:28:03Z","published":"2023-09-23T15:42:54Z","title":"Time-Series Forecasting: Unleashing Long-Term Dependencies with\n Fractionally Differenced Data","summary":" This study introduces a novel forecasting strategy that leverages the power\nof fractional differencing (FD) to capture both short- and long-term\ndependencies in time series data. Unlike traditional integer differencing\nmethods, FD preserves memory in series while stabilizing it for modeling\npurposes. By applying FD to financial data from the SPY index and incorporating\nsentiment analysis from news reports, this empirical analysis explores the\neffectiveness of FD in conjunction with binary classification of target\nvariables. Supervised classification algorithms were employed to validate the\nperformance of FD series. The results demonstrate the superiority of FD over\ninteger differencing, as confirmed by Receiver Operating Characteristic/Area\nUnder the Curve (ROCAUC) and Mathews Correlation Coefficient (MCC) evaluations.\n","authors":["Sarit Maitra","Vivek Mishra","Srashti Dwivedi","Sukanya Kundu","Goutam Kumar Kundu"],"pdf_url":"https://arxiv.org/pdf/2309.13409v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.06419v2","updated":"2023-10-04T16:24:04Z","published":"2023-03-11T14:57:52Z","title":"Use Perturbations when Learning from Explanations","summary":" Machine learning from explanations (MLX) is an approach to learning that uses\nhuman-provided explanations of relevant or irrelevant features for each input\nto ensure that model predictions are right for the right reasons. Existing MLX\napproaches rely on local model interpretation methods and require strong model\nsmoothing to align model and human explanations, leading to sub-optimal\nperformance. We recast MLX as a robustness problem, where human explanations\nspecify a lower dimensional manifold from which perturbations can be drawn, and\nshow both theoretically and empirically how this approach alleviates the need\nfor strong model smoothing. We consider various approaches to achieving\nrobustness, leading to improved performance over prior MLX methods. Finally, we\nshow how to combine robustness with an earlier MLX method, yielding\nstate-of-the-art results on both synthetic and real-world benchmarks.\n","authors":["Juyeon Heo","Vihari Piratla","Matthew Wicker","Adrian Weller"],"pdf_url":"https://arxiv.org/pdf/2303.06419v2.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2310.02942v1","updated":"2023-10-04T16:22:02Z","published":"2023-10-04T16:22:02Z","title":"Online Constraint Tightening in Stochastic Model Predictive Control: A\n Regression Approach","summary":" Solving chance-constrained stochastic optimal control problems is a\nsignificant challenge in control. This is because no analytical solutions exist\nfor up to a handful of special cases. A common and computationally efficient\napproach for tackling chance-constrained stochastic optimal control problems\nconsists of reformulating the chance constraints as hard constraints with a\nconstraint-tightening parameter. 
However, in such approaches, the choice of\nconstraint-tightening parameter remains challenging, and guarantees can mostly\nbe obtained assuming that the process noise distribution is known a priori.\nMoreover, the chance constraints are often not tightly satisfied, leading to\nunnecessarily high costs. This work proposes a data-driven approach for\nlearning the constraint-tightening parameters online during control. To this\nend, we reformulate the choice of constraint-tightening parameter for the\nclosed-loop as a binary regression problem. We then leverage a highly\nexpressive Gaussian process (GP) model for binary regression to approximate the smallest\nconstraint-tightening parameters that satisfy the chance constraints. By tuning\nthe algorithm parameters appropriately, we show that the resulting\nconstraint-tightening parameters satisfy the chance constraints up to an\narbitrarily small margin with high probability. Our approach yields\nconstraint-tightening parameters that tightly satisfy the chance constraints in\nnumerical experiments, resulting in a lower average cost than three other\nstate-of-the-art approaches.\n","authors":["Alexandre Capone","Tim Brüdigam","Sandra Hirche"],"pdf_url":"https://arxiv.org/pdf/2310.02942v1.pdf","comment":"Submitted to Transactions on Automatic Control"},{"id":"http://arxiv.org/abs/2310.02941v1","updated":"2023-10-04T16:21:23Z","published":"2023-10-04T16:21:23Z","title":"Hoeffding's Inequality for Markov Chains under Generalized\n Concentrability Condition","summary":" This paper studies Hoeffding's inequality for Markov chains under the\ngeneralized concentrability condition defined via integral probability metric\n(IPM). The generalized concentrability condition establishes a framework that\ninterpolates and extends the existing hypotheses of Markov chain Hoeffding-type\ninequalities. The flexibility of our framework allows Hoeffding's inequality to\nbe applied beyond the ergodic Markov chains in the traditional sense. We\ndemonstrate the utility by applying our framework to several non-asymptotic\nanalyses arising from the field of machine learning, including (i) a\ngeneralization bound for empirical risk minimization with Markovian samples,\n(ii) a finite sample guarantee for Polyak-Ruppert averaging of SGD, and (iii) a\nnew regret bound for rested Markovian bandits with general state space.\n","authors":["Hao Chen","Abhishek Gupta","Yin Sun","Ness Shroff"],"pdf_url":"https://arxiv.org/pdf/2310.02941v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02932v1","updated":"2023-10-04T16:09:48Z","published":"2023-10-04T16:09:48Z","title":"Assessing Large Language Models on Climate Information","summary":" Understanding how climate change affects us and learning about available\nsolutions are key steps toward empowering individuals and communities to\nmitigate and adapt to it. As Large Language Models (LLMs) rise in popularity,\nit is necessary to assess their capability in this domain. In this study, we\npresent a comprehensive evaluation framework, grounded in science communication\nprinciples, to analyze LLM responses to climate change topics. Our framework\nemphasizes both the presentational and epistemological adequacy of answers,\noffering a fine-grained analysis of LLM generations. Spanning 8 dimensions, our\nframework discerns up to 30 distinct issues in model outputs. The task is a\nreal-world example of a growing number of challenging problems where AI can\ncomplement and lift human performance. 
We introduce a novel and practical\nprotocol for scalable oversight that uses AI Assistance and relies on raters\nwith relevant educational backgrounds. We evaluate several recent LLMs and\nconduct a comprehensive analysis of the results, shedding light on both the\npotential and the limitations of LLMs in the realm of climate communication.\n","authors":["Jannis Bulian","Mike S. Schäfer","Afra Amini","Heidi Lam","Massimiliano Ciaramita","Ben Gaiarin","Michelle Chen Huebscher","Christian Buck","Niels Mede","Markus Leippold","Nadine Strauss"],"pdf_url":"https://arxiv.org/pdf/2310.02932v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02931v1","updated":"2023-10-04T16:09:35Z","published":"2023-10-04T16:09:35Z","title":"Graph data modelling for outcome prediction in oropharyngeal cancer\n patients","summary":" Graph neural networks (GNNs) are becoming increasingly popular in the medical\ndomain for the tasks of disease classification and outcome prediction. Since\npatient data is not readily available as a graph, most existing methods either\nmanually define a patient graph, or learn a latent graph based on pairwise\nsimilarities between the patients. There are also hypergraph neural network\n(HGNN)-based methods that were introduced recently to exploit potential higher\norder associations between the patients by representing them as a hypergraph.\nIn this work, we propose a patient hypergraph network (PHGN), which has been\ninvestigated in an inductive learning setup for binary outcome prediction in\noropharyngeal cancer (OPC) patients using computed tomography (CT)-based\nradiomic features for the first time. Additionally, the proposed model was\nextended to perform time-to-event analyses, and compared with GNN and baseline\nlinear models.\n","authors":["Nithya Bhasker","Stefan Leger","Alexander Zwanenburg","Chethan Babu Reddy","Sebastian Bodenstedt","Steffen Löck","Stefanie Speidel"],"pdf_url":"https://arxiv.org/pdf/2310.02931v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02925v1","updated":"2023-10-04T16:05:36Z","published":"2023-10-04T16:05:36Z","title":"Optimal Transport with Adaptive Regularisation","summary":" Regularising the primal formulation of optimal transport (OT) with a strictly\nconvex term leads to enhanced numerical complexity and a denser transport plan.\nMany formulations impose a global constraint on the transport plan, for\ninstance by relying on entropic regularisation. As it is more expensive to\ndiffuse mass for outlier points compared to central ones, this typically\nresults in a significant imbalance in the way mass is spread across the points.\nThis can be detrimental for some applications where a minimum of smoothing is\nrequired per point. To remedy this, we introduce OT with Adaptive\nRegularIsation (OTARI), a new formulation of OT that imposes constraints on the\nmass going in or/and out of each point. 
We then showcase the benefits of this\napproach for domain adaptation.\n","authors":["Hugues Van Assel","Titouan Vayer","Remi Flamary","Nicolas Courty"],"pdf_url":"https://arxiv.org/pdf/2310.02925v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02920v1","updated":"2023-10-04T16:01:43Z","published":"2023-10-04T16:01:43Z","title":"Enhancing Ayurvedic Diagnosis using Multinomial Naive Bayes and K-modes\n Clustering: An Investigation into Prakriti Types and Dosha Overlapping","summary":" The identification of Prakriti types for the human body is a long-lost\nmedical practice in finding the harmony between the nature of human beings and\ntheir behaviour. There are 3 fundamental Prakriti types of individuals. A\nperson can belong to any Dosha. In the existing models, researchers have made\nuse of SVM, KNN, PCA, Decision Tree, and various other algorithms. The output\nof these algorithms was quite decent, but it can be enhanced with the help of\nMultinomial Naive Bayes and K-modes clustering. Most of the researchers have\nconfined themselves to 3 basic classes. This might not be accurate in the\nreal-world scenario, where overlapping might occur. Considering these, we have\nclassified the Doshas into 7 categories, which includes overlapping of Doshas.\nThese are namely, VATT-Dosha, PITT-Dosha, KAPH-Dosha, VATT-PITT-Dosha,\nPITT-KAPH-Dosha, KAPH-VATT-Dosha, and VATT-PITT-KAPH-Dosha. The data used\ncontains a balanced set of all individual entries on which preprocessing steps\nof machine learning have been performed. Chi-Square test for handling\ncategorical data is being used for feature selection. For model fitting, the\nmethod used in this approach is K-modes clustering. The empirical results\ndemonstrate a better result while using the MNB classifier. All key findings of\nthis work have achieved 0.90 accuracy, 0.81 precision, 0.91 F-score, and 0.90\nrecall. The discussion suggests a provident analysis of the seven clusters and\npredicts their occurrence. The results have been consolidated to improve the\nAyurvedic advancements with machine learning.\n","authors":["Pranav Bidve","Shalini Mishra","Annapurna J"],"pdf_url":"https://arxiv.org/pdf/2310.02920v1.pdf","comment":"6 pages, 3 figures"},{"id":"http://arxiv.org/abs/2310.02919v1","updated":"2023-10-04T16:01:06Z","published":"2023-10-04T16:01:06Z","title":"Attention-based Multi-task Learning for Base Editor Outcome Prediction","summary":" Human genetic diseases often arise from point mutations, emphasizing the\ncritical need for precise genome editing techniques. Among these, base editing\nstands out as it allows targeted alterations at the single nucleotide level.\nHowever, its clinical application is hindered by low editing efficiency and\nunintended mutations, necessitating extensive trial-and-error experimentation\nin the laboratory. To speed up this process, we present an attention-based\ntwo-stage machine learning model that learns to predict the likelihood of all\npossible editing outcomes for a given genomic target sequence. We further\npropose a multi-task learning schema to jointly learn multiple base editors\n(i.e. variants) at once. Our model's predictions consistently demonstrated a\nstrong correlation with the actual experimental results on multiple datasets\nand base editor variants. 
These results provide further validation for the\nmodels' capacity to enhance and accelerate the process of refining base editing\ndesigns.\n","authors":["Amina Mollaysa","Ahmed Allam","Michael Krauthammer"],"pdf_url":"https://arxiv.org/pdf/2310.02919v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.17357v2","updated":"2023-10-04T15:55:12Z","published":"2023-09-29T16:03:25Z","title":"Module-wise Training of Neural Networks via the Minimizing Movement\n Scheme","summary":" Greedy layer-wise or module-wise training of neural networks is compelling in\nconstrained and on-device settings where memory is limited, as it circumvents a\nnumber of problems of end-to-end back-propagation. However, it suffers from a\nstagnation problem, whereby early layers overfit and deeper layers stop\nincreasing the test accuracy after a certain depth. We propose to solve this\nissue by introducing a module-wise regularization inspired by the minimizing\nmovement scheme for gradient flows in distribution space. We call the method\nTRGL for Transport Regularized Greedy Learning and study it theoretically,\nproving that it leads to greedy modules that are regular and that progressively\nsolve the task. Experimentally, we show improved accuracy of module-wise\ntraining of various architectures such as ResNets, Transformers and VGG, when\nour regularization is added, superior to that of other module-wise training\nmethods and often to end-to-end training, with as much as 60% less memory\nusage.\n","authors":["Skander Karkar","Ibrahim Ayed","Emmanuel de Bézenac","Patrick Gallinari"],"pdf_url":"https://arxiv.org/pdf/2309.17357v2.pdf","comment":"NeurIPS 2023. arXiv admin note: text overlap with arXiv:2210.00949"},{"id":"http://arxiv.org/abs/2305.18171v2","updated":"2023-10-04T15:55:04Z","published":"2023-05-29T16:02:09Z","title":"Improved Probabilistic Image-Text Representations","summary":" Image-Text Matching (ITM) task, a fundamental vision-language (VL) task,\nsuffers from the inherent ambiguity arising from multiplicity and imperfect\nannotations. Deterministic functions are not sufficiently powerful to capture\nambiguity, prompting the exploration of probabilistic embeddings to tackle the\nchallenge. However, the existing probabilistic ITM approach encounters two key\nshortcomings; the burden of heavy computations due to the Monte Carlo\napproximation, and the loss saturation issue in the face of abundant false\nnegatives. To overcome the issues, this paper presents an improved\nProbabilistic Cross-Modal Embeddings (named PCME++) by introducing a new\nprobabilistic distance with a closed-form solution. In addition, two\noptimization techniques are proposed to enhance PCME++ further; first, the\nincorporation of pseudo-positives to prevent the loss saturation problem under\nmassive false negatives; second, mixed sample data augmentation for\nprobabilistic matching. Experimental results on MS-COCO Caption and two\nextended benchmarks, CxC and ECCV Caption, demonstrate the effectiveness of\nPCME++ compared to state-of-the-art ITM methods. The robustness of PCME++ is\nalso evaluated under noisy image-text correspondences. In addition, the\npotential applicability of PCME++ in automatic prompt tuning for zero-shot\nclassification is shown. The code is available at\nhttps://naver-ai.github.io/pcmepp/.\n","authors":["Sanghyuk Chun"],"pdf_url":"https://arxiv.org/pdf/2305.18171v2.pdf","comment":"Code: https://github.com/naver-ai/pcmepp. Project page:\n https://naver-ai.github.io/pcmepp/. 
26 pages, 1.2 MB"},{"id":"http://arxiv.org/abs/2310.02913v1","updated":"2023-10-04T15:50:05Z","published":"2023-10-04T15:50:05Z","title":"ELUQuant: Event-Level Uncertainty Quantification in Deep Inelastic\n Scattering","summary":" We introduce a physics-informed Bayesian Neural Network (BNN) with flow\napproximated posteriors using multiplicative normalizing flows (MNF) for\ndetailed uncertainty quantification (UQ) at the physics event-level. Our method\nis capable of identifying both heteroskedastic aleatoric and epistemic\nuncertainties, providing granular physical insights. Applied to Deep Inelastic\nScattering (DIS) events, our model effectively extracts the kinematic variables\n$x$, $Q^2$, and $y$, matching the performance of recent deep learning\nregression techniques but with the critical enhancement of event-level UQ. This\ndetailed description of the underlying uncertainty proves invaluable for\ndecision-making, especially in tasks like event filtering. It also allows for\nthe reduction of true inaccuracies without directly accessing the ground truth.\nA thorough DIS simulation using the H1 detector at HERA indicates possible\napplications for the future EIC. Additionally, this paves the way for related\ntasks such as data quality monitoring and anomaly detection. Remarkably, our\napproach effectively processes large samples at high rates.\n","authors":["Cristiano Fanelli","James Giroux"],"pdf_url":"https://arxiv.org/pdf/2310.02913v1.pdf","comment":"13 pages, 12 figures"},{"id":"http://arxiv.org/abs/2310.02904v1","updated":"2023-10-04T15:42:26Z","published":"2023-10-04T15:42:26Z","title":"Spline-based neural network interatomic potentials: blending classical\n and machine learning models","summary":" While machine learning (ML) interatomic potentials (IPs) are able to achieve\naccuracies nearing the level of noise inherent in the first-principles data to\nwhich they are trained, it remains to be shown if their increased complexities\nare strictly necessary for constructing high-quality IPs. In this work, we\nintroduce a new MLIP framework which blends the simplicity of spline-based MEAM\n(s-MEAM) potentials with the flexibility of a neural network (NN) architecture.\nThe proposed framework, which we call the spline-based neural network potential\n(s-NNP), is a simplified version of the traditional NNP that can be used to\ndescribe complex datasets in a computationally efficient manner. We demonstrate\nhow this framework can be used to probe the boundary between classical and ML\nIPs, highlighting the benefits of key architectural changes. Furthermore, we\nshow that using spline filters for encoding atomic environments results in a\nreadily interpreted embedding layer which can be coupled with modifications to\nthe NN to incorporate expected physical behaviors and improve overall\ninterpretability. Finally, we test the flexibility of the spline filters,\nobserving that they can be shared across multiple chemical systems in order to\nprovide a convenient reference point from which to begin performing\ncross-system analyses.\n","authors":["Joshua A. Vita","Dallas R. Trinkle"],"pdf_url":"https://arxiv.org/pdf/2310.02904v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02903v1","updated":"2023-10-04T15:42:23Z","published":"2023-10-04T15:42:23Z","title":"FroSSL: Frobenius Norm Minimization for Self-Supervised Learning","summary":" Self-supervised learning (SSL) is an increasingly popular paradigm for\nrepresentation learning. 
Recent methods can be classified as\nsample-contrastive, dimension-contrastive, or asymmetric network-based, with\neach family having its own approach to avoiding informational collapse. While\ndimension-contrastive methods converge to similar solutions as\nsample-contrastive methods, it can be empirically shown that some methods\nrequire more epochs of training to converge. Motivated by closing this divide,\nwe present the objective function FroSSL which is both sample- and\ndimension-contrastive up to embedding normalization. FroSSL works by minimizing\ncovariance Frobenius norms for avoiding collapse and minimizing mean-squared\nerror for augmentation invariance. We show that FroSSL converges more quickly\nthan a variety of other SSL methods and provide theoretical and empirical\nsupport that this faster convergence is due to how FroSSL affects the\neigenvalues of the embedding covariance matrices. We also show that FroSSL\nlearns competitive representations on linear probe evaluation when used to\ntrain a ResNet18 on the CIFAR-10, CIFAR-100, STL-10, and ImageNet datasets.\n","authors":["Oscar Skean","Aayush Dhakal","Nathan Jacobs","Luis Gonzalo Sanchez Giraldo"],"pdf_url":"https://arxiv.org/pdf/2310.02903v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02902v1","updated":"2023-10-04T15:40:07Z","published":"2023-10-04T15:40:07Z","title":"Searching for High-Value Molecules Using Reinforcement Learning and\n Transformers","summary":" Reinforcement learning (RL) over text representations can be effective for\nfinding high-value policies that can search over graphs. However, RL requires\ncareful structuring of the search space and algorithm design to be effective in\nthis challenge. Through extensive experiments, we explore how different design\nchoices for text grammar and algorithmic choices for training can affect an RL\npolicy's ability to generate molecules with desired properties. We arrive at a\nnew RL-based molecular design algorithm (ChemRLformer) and perform a thorough\nanalysis using 25 molecule design tasks, including computationally complex\nprotein docking simulations. From this analysis, we discover unique insights in\nthis problem space and show that ChemRLformer achieves state-of-the-art\nperformance while being more straightforward than prior work by demystifying\nwhich design choices are actually helpful for text-based molecule design.\n","authors":["Raj Ghugare","Santiago Miret","Adriana Hugessen","Mariano Phielipp","Glen Berseth"],"pdf_url":"https://arxiv.org/pdf/2310.02902v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02897v1","updated":"2023-10-04T15:36:33Z","published":"2023-10-04T15:36:33Z","title":"Recovery of Training Data from Overparameterized Autoencoders: An\n Inverse Problem Perspective","summary":" We study the recovery of training data from overparameterized autoencoder\nmodels. Given a degraded training sample, we define the recovery of the\noriginal sample as an inverse problem and formulate it as an optimization task.\nIn our inverse problem, we use the trained autoencoder to implicitly define a\nregularizer for the particular training dataset that we aim to retrieve from.\nWe develop the intricate optimization task into a practical method that\niteratively applies the trained autoencoder and relatively simple computations\nthat estimate and address the unknown degradation operator. We evaluate our\nmethod for blind inpainting where the goal is to recover training images from\ndegradation of many missing pixels in an unknown pattern. 
We examine various\ndeep autoencoder architectures, such as fully connected and U-Net (with various\nnonlinearities and at diverse train loss values), and show that our method\nsignificantly outperforms previous methods for training data recovery from\nautoencoders. Importantly, our method greatly improves the recovery performance\nalso in settings that were previously considered highly challenging, and even\nimpractical, for such retrieval.\n","authors":["Koren Abitbul","Yehuda Dar"],"pdf_url":"https://arxiv.org/pdf/2310.02897v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02895v1","updated":"2023-10-04T15:32:27Z","published":"2023-10-04T15:32:27Z","title":"CoLiDE: Concomitant Linear DAG Estimation","summary":" We deal with the combinatorial problem of learning directed acyclic graph\n(DAG) structure from observational data adhering to a linear structural\nequation model (SEM). Leveraging advances in differentiable, nonconvex\ncharacterizations of acyclicity, recent efforts have advocated a continuous\nconstrained optimization paradigm to efficiently explore the space of DAGs.\nMost existing methods employ lasso-type score functions to guide this search,\nwhich (i) require expensive penalty parameter retuning when the\n$\\textit{unknown}$ SEM noise variances change across problem instances; and\n(ii) implicitly rely on limiting homoscedasticity assumptions. In this work, we\npropose a new convex score function for sparsity-aware learning of linear DAGs,\nwhich incorporates concomitant estimation of scale and thus effectively\ndecouples the sparsity parameter from the exogenous noise levels.\nRegularization via a smooth, nonconvex acyclicity penalty term yields CoLiDE\n($\\textbf{Co}$ncomitant $\\textbf{Li}$near $\\textbf{D}$AG\n$\\textbf{E}$stimation), a regression-based criterion amenable to efficient\ngradient computation and closed-form estimation of noise variances in\nheteroscedastic scenarios. Our algorithm outperforms state-of-the-art methods\nwithout incurring added complexity, especially when the DAGs are larger and the\nnoise level profile is heterogeneous. We also find CoLiDE exhibits enhanced\nstability manifested via reduced standard deviations in several domain-specific\nmetrics, underscoring the robustness of our novel linear DAG estimator.\n","authors":["Seyed Saman Saboksayr","Gonzalo Mateos","Mariano Tepper"],"pdf_url":"https://arxiv.org/pdf/2310.02895v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01095v3","updated":"2023-10-04T15:26:06Z","published":"2023-06-01T19:10:57Z","title":"Large-Batch, Iteration-Efficient Neural Bayesian Design Optimization","summary":" Bayesian optimization (BO) provides a powerful framework for optimizing\nblack-box, expensive-to-evaluate functions. It is therefore an attractive tool\nfor engineering design problems, typically involving multiple objectives.\nThanks to the rapid advances in fabrication and measurement methods as well as\nparallel computing infrastructure, querying many design problems can be heavily\nparallelized. This class of problems challenges BO with an unprecedented setup\nwhere it has to deal with very large batches, shifting its focus from sample\nefficiency to iteration efficiency. We present a novel Bayesian optimization\nframework specifically tailored to address these limitations. Our key\ncontribution is a highly scalable, sample-based acquisition function that\nperforms a non-dominated sorting of not only the objectives but also their\nassociated uncertainty. 
We show that our acquisition function in combination\nwith different Bayesian neural network surrogates is effective in\ndata-intensive environments with a minimal number of iterations. We demonstrate\nthe superiority of our method by comparing it with state-of-the-art\nmulti-objective optimizations. We perform our evaluation on two real-world\nproblems -- airfoil design and 3D printing -- showcasing the applicability and\nefficiency of our approach. Our code is available at:\nhttps://github.com/an-on-ym-ous/lbn_mobo\n","authors":["Navid Ansari","Hans-Peter Seidel","Vahid Babaei"],"pdf_url":"https://arxiv.org/pdf/2306.01095v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.19685v2","updated":"2023-10-04T15:23:47Z","published":"2023-05-31T09:28:03Z","title":"Deep Stochastic Mechanics","summary":" This paper introduces a novel deep-learning-based approach for numerical\nsimulation of a time-evolving Schr\\\"odinger equation inspired by stochastic\nmechanics and generative diffusion models. Unlike existing approaches, which\nexhibit computational complexity that scales exponentially in the problem\ndimension, our method allows us to adapt to the latent low-dimensional\nstructure of the wave function by sampling from the Markovian diffusion.\nDepending on the latent dimension, our method may have far lower computational\ncomplexity in higher dimensions. Moreover, we propose novel equations for\nstochastic quantum mechanics, resulting in linear computational complexity with\nrespect to the number of dimensions. Numerical simulations verify our\ntheoretical findings and show a significant advantage of our method compared to\nother deep-learning-based approaches used for quantum mechanics.\n","authors":["Elena Orlova","Aleksei Ustimenko","Ruoxi Jiang","Peter Y. Lu","Rebecca Willett"],"pdf_url":"https://arxiv.org/pdf/2305.19685v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02885v1","updated":"2023-10-04T15:21:54Z","published":"2023-10-04T15:21:54Z","title":"Something for (almost) nothing: Improving deep ensemble calibration\n using unlabeled data","summary":" We present a method to improve the calibration of deep ensembles in the small\ntraining data regime in the presence of unlabeled data. Our approach is\nextremely simple to implement: given an unlabeled set, for each unlabeled data\npoint, we simply fit a different randomly selected label with each ensemble\nmember. We provide a theoretical analysis based on a PAC-Bayes bound which\nguarantees that if we fit such a labeling on unlabeled data, and the true\nlabels on the training data, we obtain low negative log-likelihood and high\nensemble diversity on testing samples. 
Empirically, through detailed\nexperiments, we find that for low to moderately-sized training sets, our\nensembles are more diverse and provide better calibration than standard\nensembles, sometimes significantly.\n","authors":["Konstantinos Pitas","Julyan Arbel"],"pdf_url":"https://arxiv.org/pdf/2310.02885v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00673v2","updated":"2023-10-04T15:15:00Z","published":"2023-10-01T13:52:28Z","title":"Learning Type Inference for Enhanced Dataflow Analysis","summary":" Statically analyzing dynamically-typed code is a challenging endeavor, as\neven seemingly trivial tasks such as determining the targets of procedure calls\nare non-trivial without knowing the types of objects at compile time.\nAddressing this challenge, gradual typing is increasingly added to\ndynamically-typed languages, a prominent example being TypeScript that\nintroduces static typing to JavaScript. Gradual typing improves the developer's\nability to verify program behavior, contributing to robust, secure and\ndebuggable programs. In practice, however, users only sparsely annotate types\ndirectly. At the same time, conventional type inference faces\nperformance-related challenges as program size grows. Statistical techniques\nbased on machine learning offer faster inference, but although recent\napproaches demonstrate overall improved accuracy, they still perform\nsignificantly worse on user-defined types than on the most common built-in\ntypes. Limiting their real-world usefulness even more, they rarely integrate\nwith user-facing applications. We propose CodeTIDAL5, a Transformer-based model\ntrained to reliably predict type annotations. For effective result retrieval\nand re-integration, we extract usage slices from a program's code property\ngraph. Comparing our approach against recent neural type inference systems, our\nmodel outperforms the current state-of-the-art by 7.85% on the\nManyTypes4TypeScript benchmark, achieving 71.27% accuracy overall. Furthermore,\nwe present JoernTI, an integration of our approach into Joern, an open source\nstatic analysis tool, and demonstrate that the analysis benefits from the\nadditional type information. As our model allows for fast inference times even\non commodity CPUs, making our system available through Joern leads to high\naccessibility and facilitates security research.\n","authors":["Lukas Seidel","Sedick David Baker Effendi","Xavier Pinho","Konrad Rieck","Brink van der Merwe","Fabian Yamaguchi"],"pdf_url":"https://arxiv.org/pdf/2310.00673v2.pdf","comment":"- fixed last author's name - fixed header"},{"id":"http://arxiv.org/abs/2310.02877v1","updated":"2023-10-04T15:11:26Z","published":"2023-10-04T15:11:26Z","title":"Stationarity without mean reversion: Improper Gaussian process\n regression and improper kernels","summary":" Gaussian processes (GP) regression has gained substantial popularity in\nmachine learning applications. The behavior of a GP regression depends on the\nchoice of covariance function. Stationary covariance functions are favorite in\nmachine learning applications. However, (non-periodic) stationary covariance\nfunctions are always mean reverting and can therefore exhibit pathological\nbehavior when applied to data that does not relax to a fixed global mean value.\nIn this paper, we show that it is possible to use improper GP prior with\ninfinite variance to define processes that are stationary but not mean\nreverting. 
To this aim, we introduce a large class of improper kernels that can\nonly be defined in this improper regime. Specifically, we introduce the Smooth\nWalk kernel, which produces infinitely smooth samples, and a family of improper\nMat\\'ern kernels, which can be defined to be $j$-times differentiable for any\ninteger $j$. The resulting posterior distributions can be computed analytically\nand it involves a simple correction of the usual formulas. By analyzing both\nsynthetic and real data, we demonstrate that these improper kernels solve some\nknown pathologies of mean reverting GP regression while retaining most of the\nfavourable properties of ordinary smooth stationary kernels.\n","authors":["Luca Ambrogioni"],"pdf_url":"https://arxiv.org/pdf/2310.02877v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02874v1","updated":"2023-10-04T15:09:40Z","published":"2023-10-04T15:09:40Z","title":"Recent Methodological Advances in Federated Learning for Healthcare","summary":" For healthcare datasets, it is often not possible to combine data samples\nfrom multiple sites due to ethical, privacy or logistical concerns. Federated\nlearning allows for the utilisation of powerful machine learning algorithms\nwithout requiring the pooling of data. Healthcare data has many simultaneous\nchallenges which require new methodologies to address, such as highly-siloed\ndata, class imbalance, missing data, distribution shifts and non-standardised\nvariables. Federated learning adds significant methodological complexity to\nconventional centralised machine learning, requiring distributed optimisation,\ncommunication between nodes, aggregation of models and redistribution of\nmodels. In this systematic review, we consider all papers on Scopus that were\npublished between January 2015 and February 2023 and which describe new\nfederated learning methodologies for addressing challenges with healthcare\ndata. We performed a detailed review of the 89 papers which fulfilled these\ncriteria. Significant systemic issues were identified throughout the literature\nwhich compromise the methodologies in many of the papers reviewed. We give\ndetailed recommendations to help improve the quality of the methodology\ndevelopment for federated learning in healthcare.\n","authors":["Fan Zhang","Daniel Kreuter","Yichen Chen","Sören Dittmer","Samuel Tull","Tolou Shadbahr","BloodCounts! Collaboration","Jacobus Preller","James H. F. Rudd","John A. D. Aston","Carola-Bibiane Schönlieb","Nicholas Gleadall","Michael Roberts"],"pdf_url":"https://arxiv.org/pdf/2310.02874v1.pdf","comment":"Supplementary table of extracted data at the end of the document"},{"id":"http://arxiv.org/abs/1911.02903v4","updated":"2023-10-04T15:07:57Z","published":"2019-11-07T13:48:15Z","title":"How Implicit Regularization of ReLU Neural Networks Characterizes the\n Learned Function -- Part I: the 1-D Case of Two Layers with Random First\n Layer","summary":" In this paper, we consider one dimensional (shallow) ReLU neural networks in\nwhich weights are chosen randomly and only the terminal layer is trained.\nFirst, we mathematically show that for such networks L2-regularized regression\ncorresponds in function space to regularizing the estimate's second derivative\nfor fairly general loss functionals. For least squares regression, we show that\nthe trained network converges to the smooth spline interpolation of the\ntraining data as the number of hidden nodes tends to infinity. 
Moreover, we\nderive a novel correspondence between the early stopped gradient descent\n(without any explicit regularization of the weights) and the smoothing spline\nregression.\n","authors":["Jakob Heiss","Josef Teichmann","Hanna Wutte"],"pdf_url":"https://arxiv.org/pdf/1911.02903v4.pdf","comment":"adding Appendix C for more intuition, fixing typos, improving\n formulations, (moving end of Section 3.1 into Appendix B)"},{"id":"http://arxiv.org/abs/2310.02870v1","updated":"2023-10-04T15:04:13Z","published":"2023-10-04T15:04:13Z","title":"Stable and Interpretable Deep Learning for Tabular Data: Introducing\n InterpreTabNet with the Novel InterpreStability Metric","summary":" As Artificial Intelligence (AI) integrates deeper into diverse sectors, the\nquest for powerful models has intensified. While significant strides have been\nmade in boosting model capabilities and their applicability across domains, a\nglaring challenge persists: many of these state-of-the-art models remain as\nblack boxes. This opacity not only complicates the explanation of model\ndecisions to end-users but also obstructs insights into intermediate processes\nfor model designers. To address these challenges, we introduce InterpreTabNet,\na model designed to enhance both classification accuracy and interpretability\nby leveraging the TabNet architecture with an improved attentive module. This\ndesign ensures robust gradient propagation and computational stability.\nAdditionally, we present a novel evaluation metric, InterpreStability, which\nquantifies the stability of a model's interpretability. The proposed model and\nmetric mark a significant stride forward in explainable models' research,\nsetting a standard for transparency and interpretability in AI model design and\napplication across diverse sectors. InterpreTabNet surpasses other leading\nsolutions in tabular data analysis across varied application scenarios, paving\nthe way for further research into creating deep-learning models that are both\nhighly accurate and inherently explainable. The introduction of the\nInterpreStability metric ensures that the interpretability of future models can\nbe measured and compared in a consistent and rigorous manner. Collectively,\nthese contributions have the potential to promote the design principles and\ndevelopment of next-generation interpretable AI models, widening the adoption\nof interpretable AI solutions in critical decision-making environments.\n","authors":["Shiyun Wa","Xinai Lu","Minjuan Wang"],"pdf_url":"https://arxiv.org/pdf/2310.02870v1.pdf","comment":"34 pages, 7 figures, 8 tables"},{"id":"http://arxiv.org/abs/2310.02869v1","updated":"2023-10-04T15:03:56Z","published":"2023-10-04T15:03:56Z","title":"Harmonic Control Lyapunov Barrier Functions for Constrained Optimal\n Control with Reach-Avoid Specifications","summary":" This paper introduces harmonic control Lyapunov barrier functions (harmonic\nCLBF) that aid in constrained control problems such as reach-avoid problems.\nHarmonic CLBFs exploit the maximum principle that harmonic functions satisfy to\nencode the properties of control Lyapunov barrier functions (CLBFs). As a\nresult, they can be initiated at the start of an experiment rather than trained\nbased on sample trajectories. The control inputs are selected to maximize the\ninner product of the system dynamics with the steepest descent direction of the\nharmonic CLBF. Numerical results are presented with four different systems\nunder different reach-avoid environments. 
Harmonic CLBFs show a significantly\nlow risk of entering unsafe regions and a high probability of entering the goal\nregion.\n","authors":["Amartya Mukherjee","Ruikun Zhou","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2310.02869v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10792v3","updated":"2023-10-04T15:00:38Z","published":"2023-08-21T15:35:16Z","title":"Instruction Tuning for Large Language Models: A Survey","summary":" This paper surveys research works in the quickly advancing field of\ninstruction tuning (IT), a crucial technique to enhance the capabilities and\ncontrollability of large language models (LLMs). Instruction tuning refers to\nthe process of further training LLMs on a dataset consisting of\n\\textsc{(instruction, output)} pairs in a supervised fashion, which bridges the\ngap between the next-word prediction objective of LLMs and the users' objective\nof having LLMs adhere to human instructions. In this work, we make a systematic\nreview of the literature, including the general methodology of IT, the\nconstruction of IT datasets, the training of IT models, and applications to\ndifferent modalities, domains and applications, along with an analysis on\naspects that influence the outcome of IT (e.g., generation of instruction\noutputs, size of the instruction dataset, etc). We also review the potential\npitfalls of IT along with criticism against it, along with efforts pointing out\ncurrent deficiencies of existing strategies and suggest some avenues for\nfruitful research. Project page: github.com/xiaoya-li/Instruction-Tuning-Survey\n","authors":["Shengyu Zhang","Linfeng Dong","Xiaoya Li","Sen Zhang","Xiaofei Sun","Shuhe Wang","Jiwei Li","Runyi Hu","Tianwei Zhang","Fei Wu","Guoyin Wang"],"pdf_url":"https://arxiv.org/pdf/2308.10792v3.pdf","comment":"A Survey paper, Pre-print"},{"id":"http://arxiv.org/abs/2305.11857v3","updated":"2023-10-04T14:56:11Z","published":"2023-05-19T17:48:21Z","title":"Computing high-dimensional optimal transport by flow neural networks","summary":" Flow-based models are widely used in generative tasks, including normalizing\nflow, where a neural network transports from a data distribution $P$ to a\nnormal distribution. This work develops a flow-based model that transports from\n$P$ to an arbitrary $Q$ where both distributions are only accessible via finite\nsamples. We propose to learn the dynamic optimal transport between $P$ and $Q$\nby training a flow neural network. The model is trained to find an invertible\ntransport map between $P$ and $Q$ optimally by minimizing the transport cost.\nThe trained optimal transport flow allows for performing many downstream tasks,\nincluding infinitesimal density ratio estimation and distribution interpolation\nin the latent space for generative models. The effectiveness of the proposed\nmodel on high-dimensional data is empirically demonstrated in mutual\ninformation estimation, energy-based generative models, and image-to-image\ntranslation.\n","authors":["Chen Xu","Xiuyuan Cheng","Yao Xie"],"pdf_url":"https://arxiv.org/pdf/2305.11857v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09222v2","updated":"2023-10-04T14:54:34Z","published":"2023-06-15T15:58:04Z","title":"Stochastic Re-weighted Gradient Descent via Distributionally Robust\n Optimization","summary":" We develop a re-weighted gradient descent technique for boosting the\nperformance of deep neural networks, which involves importance weighting of\ndata points during each optimization step. 
Our approach is inspired by\ndistributionally robust optimization with f-divergences, which has been known\nto result in models with improved generalization guarantees. Our re-weighting\nscheme is simple, computationally efficient, and can be combined with many\npopular optimization algorithms such as SGD and Adam. Empirically, we\ndemonstrate the superiority of our approach on various tasks, including\nsupervised learning, domain adaptation. Notably, we obtain improvements of\n+0.7% and +1.44% over SOTA on DomainBed and Tabular classification benchmarks,\nrespectively. Moreover, our algorithm boosts the performance of BERT on GLUE\nbenchmarks by +1.94%, and ViT on ImageNet-1K by +1.01%. These results\ndemonstrate the effectiveness of the proposed approach, indicating its\npotential for improving performance in diverse domains.\n","authors":["Ramnath Kumar","Kushal Majmundar","Dheeraj Nagaraj","Arun Sai Suggala"],"pdf_url":"https://arxiv.org/pdf/2306.09222v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02864v1","updated":"2023-10-04T14:54:34Z","published":"2023-10-04T14:54:34Z","title":"Estimation of Models with Limited Data by Leveraging Shared Structure","summary":" Modern data sets, such as those in healthcare and e-commerce, are often\nderived from many individuals or systems but have insufficient data from each\nsource alone to separately estimate individual, often high-dimensional, model\nparameters. If there is shared structure among systems however, it may be\npossible to leverage data from other systems to help estimate individual\nparameters, which could otherwise be non-identifiable. In this paper, we assume\nsystems share a latent low-dimensional parameter space and propose a method for\nrecovering $d$-dimensional parameters for $N$ different linear systems, even\nwhen there are only $T